53 lines
1.5 KiB
Python
53 lines
1.5 KiB
Python
from pathlib import Path
|
|
from bs4 import BeautifulSoup
|
|
import sys
|
|
|
|
|
|
def roman_to_int(number: str) -> int:
|
|
if number is None:
|
|
return 0
|
|
|
|
# Sometimes the roman numeral comes with a '.'. Striping to not cause
|
|
# issues
|
|
number = number.strip('.')
|
|
|
|
roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
|
|
total = 0
|
|
for i in range(len(number) - 1, -1, -1):
|
|
num = roman[number[i]]
|
|
if 3 * num < total:
|
|
total -= num
|
|
else:
|
|
total += num
|
|
|
|
return total
|
|
|
|
# Root of the output tree: one sub-folder per book, one XML file per chapter.
folder = Path('./vol_split').absolute()

# Running number used to prefix book folders. It keeps counting across
# volumes and is only advanced for books that are actually exported.
book_num = 1
for vol in sorted(Path('./').glob('*.xml')):
    print(vol)
    # Pass raw bytes so the parser takes the encoding from the document
    # itself rather than from the platform's default text encoding.
    soup = BeautifulSoup(vol.read_bytes(), 'xml')
    for book in soup.find_all('div1'):
        book_name = (
            book['title']
            .replace('First ', '1')
            .replace('Second ', '2')
            .replace('Third ', '3')
        )
        # Front and back matter is not exported and does not consume a number.
        if any(skip in book_name for skip in ('Title', 'Preface', 'Indexes')):
            continue

        book_name = f'{book_num} - {book_name}'

        print('\t', book_name)
        for chapter in book.find_all('div2'):
            title = chapter['title']
            if 'CHAPTER' in title.upper():
                # Expects exactly "<word> <roman numeral>". split() without
                # an argument tolerates repeated or trailing whitespace.
                _, roman_num = title.split()
                chapter_num = roman_to_int(roman_num)
                chapter_title = f'Chapter {chapter_num}'
            else:
                # Non-chapter sections get a '0 - ' prefix in their file name.
                # Double quotes outside keep this valid before Python 3.12.
                chapter_title = f"0 - {title}"

            chapter_file = folder / book_name / f'{chapter_title}.xml'
            chapter_file.parent.mkdir(parents=True, exist_ok=True)

            # The fragment carries no XML declaration, so write it as UTF-8
            # (XML's default) instead of the platform's default encoding.
            chapter_file.write_text(str(chapter), encoding='utf-8')
        book_num += 1
|
|
|