toki_pona/linku_api/lipamanka_data/parse_lipamanka.py

28 lines
853 B
Python
Raw Permalink Normal View History

2024-06-17 12:17:29 +02:00
import json
from bs4 import BeautifulSoup
def create_dictionary(html_file):
    """Parse an HTML dictionary page into a mapping of words to definitions.

    The page must contain an element with ``id="the-dictionary"``. Among the
    siblings that follow that element, every ``<h3>`` starts a new entry (its
    stripped text becomes the key) and every ``<p>`` seen after a heading is
    appended to that entry's definition. Paragraphs that appear before the
    first heading, or after a heading whose text is empty, are ignored. If a
    heading text occurs twice, the later occurrence restarts the definition.

    Args:
        html_file: Path to the HTML file to parse. The file is read as UTF-8.

    Returns:
        A dict mapping each heading to its definition text. Each paragraph's
        stripped text is followed by a single space, so a non-empty
        definition ends with a trailing space; a heading with no paragraphs
        maps to the empty string.

    Raises:
        ValueError: If the document has no element with
            ``id="the-dictionary"``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(html_file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    anchor = soup.find(id="the-dictionary")
    if anchor is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            f"no element with id 'the-dictionary' found in {html_file!r}"
        )

    paragraphs_by_word = {}
    current_word = None
    for element in anchor.next_siblings:
        if element.name == "h3":
            current_word = element.text.strip()
            # Re-assigning an existing key deliberately resets its entry.
            paragraphs_by_word[current_word] = []
        elif element.name == "p" and current_word:
            paragraphs_by_word[current_word].append(element.text.strip())

    # Each paragraph keeps a trailing space, matching the established
    # format of the generated JSON.
    return {
        word: "".join(paragraph + " " for paragraph in paragraphs)
        for word, paragraphs in paragraphs_by_word.items()
    }
# Script driver: parse the saved page and dump the resulting word ->
# definition mapping as pretty-printed JSON.
html_file_path = "linku_api/lipamanka_data/page.html"
result_dict = create_dictionary(html_file_path)

with open("linku_api/lipamanka_data.json", "w+") as f:
    # Serialize first, then write the whole document in one call.
    f.write(json.dumps(result_dict, indent=2))