"""Split XML volumes into one file per chapter.

Every ``*.xml`` file in the current directory is parsed.  Each ``div2``
element found inside a ``div1`` element is written to
``./vol_split/<n> - <div1 title>/<chapter title>.xml``.
"""
from pathlib import Path
import sys

# Value of each Roman numeral symbol.
_ROMAN_VALUES = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}

# ``div1`` elements whose title contains any of these markers are skipped.
_SKIPPED_BOOK_MARKERS = ('Title', 'Preface', 'Indexes')

# Root directory that receives the per-chapter files.
folder = Path('./vol_split').absolute()


def roman_to_int(number: str) -> int:
    """Convert a Roman numeral to an integer.

    Args:
        number: The numeral, in either case, optionally wrapped in ``.``
            characters (e.g. ``'XIV.'``).  ``None`` is accepted as well.

    Returns:
        The integer value; ``0`` for ``None`` or an empty numeral.

    Raises:
        KeyError: If ``number`` contains a character that is not a Roman
            numeral symbol.
    """
    if number is None:
        return 0

    # Sometimes the numeral comes with a '.'; strip it so the lookup below
    # does not fail.  Upper-casing lets 'xiv' be read like 'XIV'.
    number = number.strip('.').upper()

    total = 0
    # Walk right to left.  A symbol much smaller than the running total can
    # only be a subtractive prefix (the I in IV, the X in XC); otherwise it
    # is added.
    for symbol in reversed(number):
        value = _ROMAN_VALUES[symbol]
        if 3 * value < total:
            total -= value
        else:
            total += value
    return total


def _chapter_file_stem(title: str) -> str:
    """Return the output file name (without suffix) for a ``div2`` title."""
    words = title.split()
    # "Chapter IV", and also "Chapter IV. Some heading" or titles with
    # repeated spaces: the numeral is always taken from the second word.
    if 'CHAPTER' in title.upper() and len(words) >= 2:
        return f'Chapter {roman_to_int(words[1])}'
    # Anything else is kept under its own title, prefixed with "0 - ".
    return f'0 - {title}'


def main() -> None:
    """Split every ``*.xml`` volume in the current directory into chapters."""
    # Imported here so that roman_to_int stays importable (and testable)
    # without the third-party parser installed.
    from bs4 import BeautifulSoup

    book_num = 1
    for vol in sorted(Path('./').glob('*.xml')):
        print(vol)
        # NOTE: read_text()/write_text() use the platform default encoding.
        soup = BeautifulSoup(vol.read_text(), 'xml')

        for book in soup.find_all('div1'):
            book_name = (
                book['title']
                .replace('First ', '1')
                .replace('Second ', '2')
                .replace('Third ', '3')
            )
            if any(marker in book_name for marker in _SKIPPED_BOOK_MARKERS):
                continue

            book_name = f'{book_num} - {book_name}'
            print('\t', book_name)

            for chapter in book.find_all('div2'):
                stem = _chapter_file_stem(chapter['title'])
                chapter_file = folder / book_name / f'{stem}.xml'
                chapter_file.parent.mkdir(parents=True, exist_ok=True)
                chapter_file.write_text(str(chapter))

            # NOTE(review): the original indentation was lost; the counter is
            # assumed to advance once per emitted book so that every output
            # folder gets its own number -- confirm.
            book_num += 1


if __name__ == '__main__':
    main()