# mh_parser/vol_splitter.py
# Repository viewer timestamp: 2023-12-19 11:02:07 +00:00
from pathlib import Path
from bs4 import BeautifulSoup
import sys
def roman_to_int(number: str) -> int:
    """Convert a Roman numeral string to its integer value.

    The numeral is scanned from right to left: each symbol is added to the
    running total unless it is a subtractive prefix (such as the ``I`` in
    ``IV``), in which case it is subtracted.

    Args:
        number: Roman numeral such as ``'XIV'``. Letter case is ignored, as
            are surrounding whitespace and '.' characters. ``None`` is
            accepted and treated as "no numeral".

    Returns:
        The integer value, or 0 when ``number`` is ``None`` or empty.

    Raises:
        KeyError: If a remaining character is not one of ``IVXLCDM``.
    """
    if number is None:
        return 0
    # Sometimes the roman numeral comes with a '.', stray whitespace, or in
    # lower case. Normalising so none of these cause a lookup failure.
    number = number.strip('. \t\r\n').upper()
    roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    total = 0
    for symbol in reversed(number):
        num = roman[symbol]
        # In a well-formed numeral, a symbol worth less than a third of what
        # has been accumulated to its right is a subtractive prefix (I before
        # V/X, X before L/C, C before D/M). Repeats such as III never meet
        # this condition, so they are added.
        if 3 * num < total:
            total -= num
        else:
            total += num
    return total
# Split every volume XML file in the current directory into one file per
# chapter, written to ./vol_split/<book number> - <book title>/<chapter>.xml.
folder = Path('./vol_split').absolute()
# Running book counter; it is not reset between volumes, so numbering
# continues from one volume file to the next.
book_num = 1
for vol in sorted(Path('./').glob('*.xml')):
    print(vol)
    soup = BeautifulSoup(vol.read_text(), 'xml')
    for book in soup.find_all('div1'):
        # Shorten ordinal prefixes, e.g. 'First X' -> '1X'.
        book_name = (
            book['title']
            .replace('First ', '1')
            .replace('Second ', '2')
            .replace('Third ', '3')
        )
        # Sections whose title mentions these words are not written out.
        if any(skip in book_name for skip in ('Title', 'Preface', 'Indexes')):
            continue
        book_name = f'{book_num} - {book_name}'
        print('\t', book_name)
        for chapter in book.find_all('div2'):
            title = chapter['title']
            # Split on any whitespace so doubled spaces or extra trailing
            # words do not break the 'Chapter <numeral>' parsing.
            words = title.split()
            if 'CHAPTER' in title.upper() and len(words) >= 2:
                chapter_num = roman_to_int(words[1])
                chapter_title = f'Chapter {chapter_num}'
            else:
                # Non-chapter sections get a '0 - ' prefix in their file
                # name. Double quotes outside keep this f-string valid on
                # Python versions older than 3.12.
                chapter_title = f"0 - {title}"
            chapter_file = folder / book_name / f'{chapter_title}.xml'
            chapter_file.parent.mkdir(parents=True, exist_ok=True)
            chapter_file.write_text(str(chapter))
        # NOTE(review): the original indentation was lost; advancing the
        # counter once per written book (skipped sections do not consume a
        # number) is the reading that matches the numbered folder names.
        # Confirm against the upstream file.
        book_num += 1