renamed and updated
This commit is contained in:
parent
8c767b5235
commit
a1e69b1368
126
html_parse.py
Normal file
126
html_parse.py
Normal file
@ -0,0 +1,126 @@
|
|||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from tqdm import tqdm
|
||||||
|
from loguru import logger
|
||||||
|
import string
|
||||||
|
import shutil
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
|
|
||||||
|
def roman_to_int(number: str) -> int:
    """Convert a Roman numeral to its integer value.

    The numeral is read from right to left. A symbol is subtracted when it
    is much smaller than the running total (``3 * value < total``), which
    is how subtractive pairs such as ``IV`` or ``XC`` are recognised
    without looking at the neighbouring symbol.

    Args:
        number: Roman numeral such as ``'XIV'``. Letter case is ignored and
            surrounding whitespace or a trailing ``'.'`` is tolerated.
            ``None`` and the empty string both yield ``0``.

    Returns:
        The integer value of the numeral, or ``0`` when there is nothing
        to convert.

    Raises:
        KeyError: If ``number`` contains a character that is not one of
            the symbols ``I V X L C D M``.
    """
    if number is None:
        return 0

    roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}

    # Titles sometimes carry the numeral as "ii" or "II."; normalise first
    # so those do not fail the symbol lookup below.
    cleaned = number.strip().strip('.').strip().upper()

    total = 0
    for symbol in reversed(cleaned):
        value = roman[symbol]
        if 3 * value < total:
            total -= value
        else:
            total += value

    return total
|
||||||
|
|
||||||
|
|
||||||
|
# File rename...
|
||||||
|
# <TITLE>Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II].</TITLE>
|
||||||
|
# The text within the '[]' should be the name of the file.
|
||||||
|
def get_filename(soup: BeautifulSoup) -> tuple:
    """Derive the book folder name and the chapter file name from the title.

    The ``<title>`` is expected to look like
    ``Matthew Henry's Complete Commentary on the Whole Bible [Genesis, Chapter II].``
    and the text inside the brackets names the output file.

    Args:
        soup: Parsed document whose ``title`` element holds that heading.

    Returns:
        A ``(book, file)`` tuple. ``book`` is the title-cased book name
        with the words First/Second/Third replaced by 1/2/3. ``file`` is
        ``"<book> <chapter number>"``, or just ``book`` when the title has
        no chapter number (for example an introduction).

    Raises:
        KeyError: Propagated from ``roman_to_int`` when the trailing word
            of the title is not a Roman numeral.
    """
    # Local import: only this function needs the module.
    import re

    title = soup.title.string
    title = (title
             .replace("Matthew Henry's Complete Commentary on the Whole Bible [", '')
             .replace("].", ''))
    upper_title = title.upper()

    # Drop the word "Chapter" in any letter case. The branch test below is
    # case-insensitive, so the removal has to be as well; otherwise an
    # upper-case "CHAPTER" would end up inside the book name.
    *book, _ = re.sub('chapter', '', title, flags=re.IGNORECASE).split()
    book = ' '.join(book)
    book = book.strip(string.punctuation).title()
    book = (
        book.replace('First', '1')
        .replace('Second', '2')
        .replace('Third', '3')
    )

    if 'INTRODUCTION' in upper_title:
        # Introductions have no chapter number.
        roman = ''
    elif 'CHAPTER' in upper_title:
        # Everything after the word "chapter" is the numeral.
        *_, roman = upper_title.split('CHAPTER')
    else:
        # No "chapter" keyword: the numeral is the last word.
        *_, roman = title.split()

    num = roman_to_int(roman.strip())
    file = f'{book} {num}' if num else book

    return book.strip(), file.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def create_md_file(base_dir: Path, folder: str, file: str) -> Path:
    """Make sure ``<base_dir>/<folder>/<file>.md`` exists and return it.

    Missing parent directories are created. An existing file is left in
    place with its content untouched.

    Args:
        base_dir: Root output directory.
        folder: Sub-directory below ``base_dir``.
        file: File name without the ``.md`` extension.

    Returns:
        The absolute path of the markdown file.
    """
    target = Path(f'{base_dir}/{folder}/{file}.md').absolute()

    directory = target.parent
    directory.mkdir(parents=True, exist_ok=True)

    target.touch(exist_ok=True)
    return target
|
||||||
|
|
||||||
|
|
||||||
|
def write_blocks(soup: BeautifulSoup, md_file: Path) -> None:
    """Print the ``repr`` of every stripped string in each ``<p>`` element.

    This is still a debugging aid: nothing is written to ``md_file`` yet.
    An earlier, disabled draft walked the child tags of each paragraph and
    was meant to write ``**text**`` for ``<b>`` and ``*text*`` for ``<i>``.

    Args:
        soup: Parsed document to walk.
        md_file: Destination markdown file; currently unused.
    """
    paragraphs = soup.find_all('p')
    fragments = (
        text
        for paragraph in paragraphs
        for text in paragraph.stripped_strings
    )
    for fragment in fragments:
        print(repr(fragment))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Directory that holds the source commentary pages (*.HTM).
    folder = Path('/home/bear/PycharmProjects/mark_henry_md/matthew_henry')
    htm_files = list(folder.glob('*.HTM'))

    # Markdown output goes below the current working directory.
    output_folder = Path('./Matthew Henry Commentary').absolute()

    # Only the slice [7:8] (a single page) is processed.
    # NOTE(review): presumably a development shortcut - confirm before
    # running over the whole folder.
    # for file in tqdm(htm_files[7:8]):
    for file in htm_files[7:8]:
        soup = BeautifulSoup(file.read_text(), 'html.parser')

        # The page title decides both the book folder and the file name.
        book, chapter = get_filename(soup)
        md_file = create_md_file(output_folder, book, chapter)

        write_blocks(soup, md_file)
|
421
main.py
421
main.py
@ -1,17 +1,55 @@
|
|||||||
import time
|
import re
|
||||||
from pathlib import Path
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from tqdm import tqdm
|
|
||||||
from loguru import logger
|
|
||||||
import string
|
import string
|
||||||
import shutil
|
import sys
|
||||||
from html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pythonbible as bible
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from loguru import logger
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
# Root directory for the generated markdown, resolved against the current
# working directory at import time.
base = Path('./Matthew Henry Commentary/xml').absolute()

# Replace the handler registered under id 0 with a file sink.
# NOTE(review): id 0 is presumably loguru's default stderr handler - confirm.
logger.remove(0)
logger.add('log.txt')
|
||||||
|
|
||||||
|
class MyHTMLParser(HTMLParser):
|
||||||
|
# Markdown file currently being written; set by create_md_file().
file = None
# osisref of the most recent <scripcom type="Commentary"> start tag.
passage_verse = None
# Name of the tag currently being processed, None between tags.
tag = None
tag_type = None  # 'start' or 'end'
# Attributes of the most recent start tag.
attrs = dict()

# (pattern, replacement) pairs for re.sub(), applied in this order by
# write_to_file(). They move list and sub-list markers onto their own line.
# Each replacement also appends an id ("~1" .. "~4") to the marker so that
# clean_file() can later work out how many `\t` to put in front of the line.
patterns = [
    # "[1.]" markers
    (r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~4'),
    # "(1.)" markers
    (r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~3'),

    # The next two share the same id because they are sometimes mixed
    # within the parent lists.
    (r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~2'),
    (r"(\S\s)(\d+\.)(\s\b|\W)",
     lambda x: f'{x.group(1)}\n{x.group(2)}~2{x.group(3)}'),

    # Roman-numeral markers such as "II.", rewritten as "2.". The leading
    # \b keeps the pattern from matching the tail of an upper-case word
    # (the "D." in "LORD."). The class is referenced by name rather than
    # through a module-level parser instance, so the pattern also works
    # when no such global exists.
    (r"\b(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.",
     lambda x: f'\n{MyHTMLParser.roman_to_int(x.group())}.~1'),
]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
def roman_to_int(number: str) -> int:
|
def roman_to_int(number: str) -> int:
|
||||||
if number is None:
|
if number is None:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
# Sometimes the roman numeral comes with a '.'. Striping to not cause
|
||||||
|
# issues
|
||||||
|
number = number.strip('.')
|
||||||
|
|
||||||
roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
|
roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
|
||||||
total = 0
|
total = 0
|
||||||
for i in range(len(number) - 1, -1, -1):
|
for i in range(len(number) - 1, -1, -1):
|
||||||
@ -23,104 +61,309 @@ def roman_to_int(number: str) -> int:
|
|||||||
|
|
||||||
return total
|
return total
|
||||||
|
|
||||||
|
def create_md_file(self, base_dir: Path, folder: str, file: str):
    """Prepare a fresh markdown target and remember it on the parser.

    Builds ``<base_dir>/<folder>/<file>.md``, creates any missing parent
    directories and deletes a file already present at that path. The file
    itself is not created here; write_to_file() appends to it later.

    Args:
        base_dir: Root output directory.
        folder: Sub-directory below ``base_dir``.
        file: File name without the ``.md`` extension.

    Returns:
        The absolute path, which is also stored as ``self.file``.
    """
    target = Path(f'{base_dir}/{folder}/{file}.md').absolute()
    target.parent.mkdir(parents=True, exist_ok=True)

    # Output is appended piece by piece, so anything left over at this
    # path has to go first.
    target.unlink(missing_ok=True)

    self.file = target
    return target
|
||||||
|
|
||||||
|
def write_to_file(self, data):
    """Append ``data`` to the current markdown file.

    Before writing, every ``(pattern, replacement)`` pair in
    ``self.patterns`` is applied in order with ``re.sub``. That tags the
    list and sub-list markers for later processing in clean_file().

    Args:
        data: Text fragment to append.

    Raises:
        ValueError: If no output file has been set.
    """
    if self.file is None:
        raise ValueError('No File specified')

    tagged = data
    for regex, replacement in self.patterns:
        tagged = re.sub(regex, replacement, tagged)

    with open(self.file, 'a+') as handle:
        handle.write(tagged)
|
||||||
|
|
||||||
|
def clean_file(self):
    """Rewrite the current markdown file in its final layout.

    The file is read completely and written back with:

    * a ``# Chapter Introduction`` heading at the top,
    * blank lines dropped,
    * a ``# `` heading skipped when it repeats the previous ``# `` heading,
    * a newline in front of every heading,
    * an extra newline after every line that does not start with a digit,
    * list lines (first character is a digit) indented with tabs. The
      two-character id that write_to_file() attached (``~1`` .. ``~4``) is
      removed, and each id seen for the first time since the last non-list
      line gets one more tab than the deepest id so far.
    * the ``Passage: `` prefix removed.

    Raises:
        ValueError: If no output file has been set.
    """
    logger.info(' CLEANING FILE')

    if self.file is None:
        raise ValueError('No File specified')

    with open(self.file, 'r') as source:
        raw_lines = source.readlines()

    depth_by_id = {}  # list id -> number of tabs for that list level
    last_header = None

    with open(self.file, 'w') as out:
        out.write('# Chapter Introduction\n\n')

        for line in raw_lines:
            text = line.strip()

            # Blank lines are never written.
            if not text:
                continue

            # A main header that repeats the previous one is skipped.
            if text.startswith('# '):
                if text == last_header:
                    continue
                last_header = text

            # Headings get a newline in front of them.
            if line.startswith('#'):
                line = f'\n{text}'

            if text[0] not in string.digits:
                # Not a list line: end the paragraph and forget the
                # indentation levels collected so far.
                line = f'{line}\n'
                depth_by_id.clear()
            else:
                # The list id is the '~' and the character after it.
                marker = line.find('~')
                list_id = line[marker:marker + 2]

                # The first id of a run gets depth 0 (-1 + 1).
                if list_id not in depth_by_id:
                    depth_by_id[list_id] = max(depth_by_id.values(), default=-1) + 1

                tabs = "\t" * depth_by_id[list_id]
                line = f'{tabs}{line.replace(list_id, "")}'

            # The prefix only kept passage lines away from the list regexes.
            line = line.removeprefix('Passage: ')

            out.write(line)
|
||||||
|
|
||||||
|
@staticmethod
def normalize_osis_verses(verses):
    """Turn an osisRef string into readable scripture references.

    Takes something like
    ``'Bible:John.1.3 Bible:John.1.10 Bible:Acts.17.24-Acts.17.25'``.
    Every ``Bible:`` entry is formatted through pythonbible; an entry with
    a ``-`` has both ends formatted separately and then again as one range.

    Args:
        verses: The osisRef attribute value.

    Returns:
        The formatted references joined with ``';'``.
    """
    normalized_refs = []
    entries = verses.removeprefix("Bible:").split('Bible:')

    for entry in entries:
        # 'Acts.17.24' -> 'Acts 17:24': the first dot follows the book
        # name, any remaining dot becomes a colon.
        spanned_verses = [
            bible.format_scripture_references(
                bible.get_references(
                    part.replace('.', ' ', 1).replace('.', ':').strip()
                )
            )
            for part in entry.split('-')
        ]

        try:
            formatted = bible.format_scripture_references(
                bible.get_references('-'.join(spanned_verses))
            )
        except BaseException:
            # Record which input failed, then let the error propagate.
            logger.warning(f"Error with: {verses=}: {spanned_verses=}")
            raise

        normalized_refs.append(formatted)

    return ';'.join(normalized_refs)
|
||||||
|
|
||||||
|
@staticmethod
def obsidian_links(verses):
    """Turn ``Book Chapter:Verse-Verse`` references into Obsidian bible links.

    Args:
        verses: References separated by ``';'``.

    Returns:
        One ``[[...]]`` link per reference, joined with ``'; '``. Inside a
        link ``':'`` becomes ``'#'`` and ``'-'`` becomes ``'..'``.
    """
    return '; '.join(
        f"[[{reference.replace(':', '#').replace('-', '..')}]]"
        for reference in verses.split(';')
    )
|
||||||
|
|
||||||
|
def convert_passage(self, verses):
    """Build an ``osisRef="..."`` string from a ``passage`` attribute value.

    ``verses`` is a comma-separated list such as ``'3:1-5,8'``. An item
    that is not of the form ``chapter:verses`` is used as the verse part
    as-is and keeps the chapter of the previous item (``None`` if there
    was none). The book name is the folder that contains ``self.file``.

    Args:
        verses: Comma-separated ``chapter:verse[-verse]`` items.

    Returns:
        ``osisRef="Bible:<book>.<chapter>.<verse>[-...] Bible:..."``.
    """
    book = self.file.parts[-2]
    chapter = None
    refs = []

    for item in verses.split(','):
        try:
            chapter, verse_range = item.split(':')
        except ValueError:
            # Unpacking fails when the item has no single "chapter:"
            # prefix; only that case is expected, so nothing broader is
            # caught here.
            verse_range = item

        endpoints = [
            f'{book}.{chapter}.{verse}' for verse in verse_range.split('-')
        ]
        refs.append(f'Bible:{"-".join(endpoints)}')

    joined = ' '.join(refs)
    return f'osisRef="{joined}"'
|
||||||
|
|
||||||
|
def process_tag(self, test=None):
    """Write the markdown that belongs to the tag currently being tracked.

    Acts on ``self.tag``, ``self.tag_type`` ('start' or 'end') and
    ``self.attrs``:

    * ``b`` / ``i``: the bold / italic marker, on start and on end.
    * ``scripcom``: on start of a ``type="Commentary"`` element, remember
      its ``osisref`` in ``self.passage_verse``.
    * ``h4``: section header. Start opens a ``# `` heading; end closes it
      and, if a passage was remembered, adds a ``## Verses: a - b`` line.
    * ``scripref``: on start, write the reference as Obsidian links, taken
      from ``osisref`` or else built from ``passage``.
    * ``p``: start of a ``class="passage"`` paragraph writes ``Passage: ``;
      end writes a blank line.
    * ``li``: start writes ``- ``, anything else writes a newline.

    Any other tag is ignored.

    Args:
        test: Unused.
    """
    tag = self.tag
    starting = self.tag_type == 'start'
    ending = self.tag_type == 'end'

    if tag == 'b':  # Bold
        self.write_to_file('**')

    elif tag == 'i':  # Italics
        self.write_to_file('*')

    elif tag == 'scripcom':
        # Keep the verses for the section that follows.
        if starting and self.attrs['type'] == 'Commentary':
            self.passage_verse = self.attrs['osisref']

    elif tag == 'h4':  # Section header
        if starting:
            self.write_to_file('\n# ')

        if ending:
            self.write_to_file('\n')

            if self.passage_verse:
                try:
                    readable = self.normalize_osis_verses(self.passage_verse)
                    verse = bible.get_references(readable)[0]
                except BaseException:
                    # Record the offending reference, then re-raise.
                    logger.debug(self.passage_verse)
                    raise

                self.write_to_file('## Verses: ')
                self.write_to_file(
                    f'{verse.start_verse} - {verse.end_verse}'
                )
                self.write_to_file('\n\n')

    elif tag == 'scripref':  # Scripture reference
        if starting:
            if 'osisref' in self.attrs:
                verses = self.normalize_osis_verses(self.attrs['osisref'])
            else:
                # No osisref: build one from the passage attribute.
                verses = self.normalize_osis_verses(
                    self.convert_passage(self.attrs['passage'])
                )

            self.write_to_file(self.obsidian_links(verses))

    elif tag == 'p':  # Paragraph
        if starting and self.attrs.get('class', False) == 'passage':
            # The prefix keeps the list regexes in clean_file() from
            # picking this line up; clean_file() removes it again.
            self.write_to_file('Passage: ')
        if ending:
            self.write_to_file('\n\n')

    elif tag == 'li':
        self.write_to_file('- ' if starting else '\n')
|
||||||
#
|
|
||||||
#
|
|
||||||
# file.write('\n')
|
|
||||||
#
|
|
||||||
# file.write('\n')
|
|
||||||
|
|
||||||
|
def handle_starttag(self, tag, attrs):
    """Record an opening tag and let process_tag() act on it.

    Args:
        tag: Name of the tag.
        attrs: ``(name, value)`` pairs; stored as a dict in ``self.attrs``.
    """
    # Set the tag trackers before dispatching.
    self.tag, self.tag_type = tag, 'start'
    self.attrs = dict(attrs)

    self.process_tag()
|
||||||
|
|
||||||
|
def handle_endtag(self, tag):
    """Record a closing tag, let process_tag() act on it, then reset.

    ``self.attrs`` still holds the attributes of the most recent start tag
    while process_tag() runs and is emptied afterwards.

    Args:
        tag: Name of the tag being closed.
    """
    self.tag, self.tag_type = tag, 'end'
    self.process_tag()

    # Clear the trackers so that text which follows is handled as
    # belonging to no tag.
    self.tag = self.tag_type = None
    self.attrs.clear()
|
||||||
|
|
||||||
|
def handle_data(self, data):
    """Write the text between tags to the markdown file.

    Text is dropped when the current tag is ``scripref``, ``h2``, ``h3``
    or ``h5``, and when it is exactly one newline. Otherwise an em dash
    becomes ``'-- '``, ``GOD`` becomes ``God`` and newlines become spaces
    before the text is passed to write_to_file().

    Args:
        data: Raw text handed over by the parser.
    """
    skipped = self.tag in ['scripref', 'h2', 'h3', 'h5']
    if skipped or data == '\n':
        return

    text = (
        data
        .replace('—', '-- ')
        .replace('GOD', 'God')
        .replace('\n', ' ')
    )
    self.write_to_file(text)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    parser = MyHTMLParser()

    # One tick per chapter file; the total is hard-coded.
    pbar = tqdm(total=1273)

    # Every *.xml file in the working directory is one volume.
    for vol in sorted(Path('./').glob('*.xml')):
        logger.info(vol)
        soup = BeautifulSoup(vol.read_text(), 'xml')

        # Each <div1> is a book; it becomes the folder for all of the
        # book's chapters.
        for book in soup.find_all('div1'):
            logger.info(book['title'])
            book_name = book['title']
            for word, digit in (('First', '1'), ('Second', '2'), ('Third', '3')):
                book_name = book_name.replace(word, digit)

            # Each <div2> is a chapter; it becomes one file in that folder.
            for chapter in book.find_all('div2'):
                logger.info(chapter['title'])

                filename = chapter['title']

                # Replace the Roman numeral in the file name.
                if 'CHAPTER' in filename.upper():
                    _, roman_num = chapter['title'].split(' ')
                    filename = f'Chapter {parser.roman_to_int(roman_num)}'

                parser.create_md_file(base, book_name, filename)

                pbar.set_description(f"Processing: {vol.name}: {book_name}: {filename}")

                # Flatten every <p class="passage"> to plain text, which
                # unwraps any scripRef inside it. Per the original note,
                # "Dan" in these paragraphs gets confused with the book
                # Daniel.
                for passage in chapter.find_all('p', {'class': 'passage'}):
                    new_t = soup.new_tag('p', attrs=passage.attrs)
                    new_t.string = passage.text
                    passage.replace_with(new_t)

                # Empty every scripRef so that tags nested inside it are
                # not processed.
                for scripture in chapter.find_all('scripRef'):
                    scripture.clear()

                # Feed the chapter to the parser one child node at a time.
                for doc in chapter.children:
                    try:
                        parser.feed(str(doc))
                    except Exception as e:
                        logger.exception(e)
                        logger.debug(chapter.attrs)
                        raise

                parser.clean_file()
                pbar.update(1)

                # if pbar.last_print_n > 2:
                #     sys.exit()

    pbar.close()
    parser.close()
|
||||||
|
Loading…
Reference in New Issue
Block a user