# toki_pona/scrape/app.py
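"""Scrape toki pona word data.

Builds the word list from the nimi.li index and fetches per-word definition
pages from linku.la, caching every HTTP response under data/ so repeated
runs can work offline.

Run directly, e.g. ``python app.py --debug``.
"""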

import argparse
import json
import logging
import os
import requests
from bs4 import BeautifulSoup, NavigableString

logging.basicConfig()
logger = logging.getLogger()

URL_BASE = 'https://nimi.li/'
URL_WORD = URL_BASE + '{word}'
URL_WORD_2 = 'https://linku.la/words/{word}'


def setup_logging_level(debug=False):
    log_level = logging.DEBUG if debug else logging.INFO
    logger.setLevel(log_level)
    logger.debug("Debugging enabled")


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('query', nargs='*', default="", help="freeform")
    parser.add_argument('--debug', dest='debug', action='store_true')
    parser.add_argument('--raise-exceptions', dest='raise_exceptions', action='store_true')
    return parser.parse_args()


args = parse_args()
setup_logging_level(args.debug)

DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
WORDS_DIR = os.path.join(DATA_DIR, 'words')


def get_cache_or_url(url, cache_file):
    """Return the body of url, reading from cache_file if it exists and writing to it otherwise."""
    if os.path.exists(cache_file):
        logger.debug("Getting cached response from {}".format(cache_file))
        with open(cache_file, 'r') as f:
            response_text = f.read()
    else:
        os.makedirs(os.path.dirname(cache_file), exist_ok=True)
        logger.debug("Getting response from {}".format(url))
        response = requests.get(url)
        if response.status_code != 200:
            if args.raise_exceptions:
                raise Exception("Error getting response from {}".format(url))
            # Without --raise-exceptions the error body is still cached below.
            logger.error("Error getting response from {}".format(url))
        response_text = response.text
        with open(cache_file, 'w+') as f:
            f.write(response.text)
    return response_text


def get_word_list():
    """Return the list of toki pona words, scraping the nimi.li index on the first run."""
    words_file = os.path.join(DATA_DIR, 'words.json')
    if os.path.exists(words_file):
        logger.debug("Getting cached response from {}".format(words_file))
        with open(words_file, 'r') as f:
            words = json.load(f)
    else:
        words = set()
        cache_file = os.path.join(DATA_DIR, 'all_words.html')
        response_text = get_cache_or_url(URL_BASE, cache_file)
        soup = BeautifulSoup(response_text, 'html.parser')
        word_tags = soup.select('main > div.grid p.font-pona')
        for word_tag in word_tags:
            # Some entries end in a digit; strip it to get the bare word.
            if word_tag.text[-1] in ['0', '1', '2', '3']:
                final_word = word_tag.text[:-1]
            else:
                final_word = word_tag.text
            words.add(final_word)
        # A set is not JSON-serializable, so store a sorted list.
        words = sorted(words)
        with open(words_file, 'w+') as f:
            json.dump(words, f, ensure_ascii=False, indent=2)
    return words


def get_word_data(word, url):
    """Scrape the pu definitions for a single word from its linku.la page."""
    cache_file = os.path.join(WORDS_DIR, f'{word}.html')
    response_text = get_cache_or_url(url, cache_file)
    soup = BeautifulSoup(response_text, 'html.parser')

    # Raw strings keep the backslash-escaped colons in the Tailwind class names.
    pu_tags = soup.select(r'body > div > div.px-4.sm\:px-8.lg\:px-16.m-auto.max-w-screen-xl > main > div.grid.sm\:grid-cols-2.mt-6.gap-6 > div:nth-child(1) > div')
    pu_defs = [t.text for t in pu_tags]
    pu_definitions = []
    for pu_def in pu_defs:
        # The first whitespace-separated token is treated as the part of speech.
        pu_def_split = pu_def.split()
        pos = pu_def_split[0]
        definition = ' '.join(pu_def_split[1:])
        pu_definitions.append({
            "pos": pos,
            "definition": definition,
        })

    # TODO: doesn't work
    ku_tags = soup.select(r'body > div > div > main > div > div:nth-child(1) > div.p-6.pt-0.flex.flex-col.gap-3 > div:nth-child(2) > span')
    ku_defs = [t.text for t in ku_tags]

    return {
        'pu_definitions': pu_definitions,
    }


def get_words_data(word_list):
    """Return a mapping of word -> scraped data, cached as words_data.json."""
    words_data_file = os.path.join(DATA_DIR, 'words_data.json')
    if os.path.exists(words_data_file):
        logger.debug("Getting cached response from {}".format(words_data_file))
        with open(words_data_file, 'r') as f:
            words_data = json.load(f)
    else:
        words_data = {}
        for word in word_list:
            url = URL_WORD_2.format(word=word)
            word_data = get_word_data(word, url)
            words_data[word] = word_data
        with open(words_data_file, 'w+') as f:
            json.dump(words_data, f, ensure_ascii=False, indent=2)
    return words_data


def main():
    for folder in [WORDS_DIR]:
        os.makedirs(folder, exist_ok=True)
    word_list = get_word_list()
    print(word_list)
    words_data = get_words_data(word_list)
    adjectives = []
    for word, data in words_data.items():
        for pu_def in data.get('pu_definitions', []):
            if pu_def.get('pos') == 'ADJECTIVE':
                adjectives.append(word)
    print('" | "'.join(adjectives))


if __name__ == "__main__":
    main()