import argparse
import json
import logging
import os

import requests
from bs4 import BeautifulSoup

logging.basicConfig()
logger = logging.getLogger()

URL_BASE = 'https://nimi.li/'
URL_WORD = URL_BASE + '{word}'
URL_WORD_2 = 'https://linku.la/words/{word}'


def setup_logging_level(debug=False):
    log_level = logging.DEBUG if debug else logging.INFO
    logger.setLevel(log_level)
    logger.debug("Debugging enabled")


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('query', nargs='*', default="", help="freeform")
    parser.add_argument('--debug', dest='debug', action='store_true')
    parser.add_argument('--raise-exceptions', dest='raise_exceptions',
                        action='store_true')
    return parser.parse_args()


args = parse_args()
setup_logging_level(args.debug)

DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
WORDS_DIR = os.path.join(DATA_DIR, 'words')


def get_cache_or_url(url, cache_file):
    """Return the body of url, reading from cache_file when it exists."""
    if os.path.exists(cache_file):
        logger.debug("Getting cached response from {}".format(cache_file))
        with open(cache_file, 'r') as f:
            response_text = f.read()
    else:
        os.makedirs(os.path.dirname(cache_file), exist_ok=True)
        logger.debug("Getting response from {}".format(url))
        response = requests.get(url)
        if response.status_code != 200:
            if args.raise_exceptions:
                raise Exception("Error getting response from {}".format(url))
            logger.error("Error getting response from {}".format(url))
        response_text = response.text
        if response.status_code == 200:
            # Only cache successful responses, so a transient error page
            # doesn't poison the cache.
            with open(cache_file, 'w+') as f:
                f.write(response_text)
    return response_text


def get_word_list():
    """Scrape the word index from nimi.li, caching the result as JSON."""
    words_file = os.path.join(DATA_DIR, 'words.json')
    if os.path.exists(words_file):
        logger.debug("Getting cached response from {}".format(words_file))
        with open(words_file, 'r') as f:
            words = json.load(f)
    else:
        words = set()
        cache_file = os.path.join(DATA_DIR, 'all_words.html')
        response_text = get_cache_or_url(URL_BASE, cache_file)
        soup = BeautifulSoup(response_text, 'html.parser')
        word_tags = soup.select('main > div.grid p.font-pona')
        for word_tag in word_tags:
            # Some entries carry a trailing digit; strip it.
            if word_tag.text[-1] in ['0', '1', '2', '3']:
                final_word = word_tag.text[:-1]
            else:
                final_word = word_tag.text
            words.add(final_word)
        # Sets are not JSON-serializable, so store a sorted list.
        words = sorted(words)
        with open(words_file, 'w+') as f:
            json.dump(words, f, ensure_ascii=False, indent=2)
    return words


def get_word_data(word, url):
    """Fetch one word page from linku.la and extract its pu definitions."""
    cache_file = os.path.join(WORDS_DIR, f'{word}.html')
    response_text = get_cache_or_url(url, cache_file)
    soup = BeautifulSoup(response_text, 'html.parser')
    # Colons in Tailwind class names must be escaped in CSS selectors,
    # hence the doubled backslashes.
    tags = soup.select(
        'body > div > div.px-4.sm\\:px-8.lg\\:px-16.m-auto.max-w-screen-xl '
        '> main > div.grid.sm\\:grid-cols-2.mt-6.gap-6 '
        '> div:nth-child(1) > div')
    pu_defs = [t.text for t in tags]
    pu_definitions = []
    for pu_def in pu_defs:
        # Each definition starts with the part of speech, e.g.
        # "ADJECTIVE good, positive"; the rest is the gloss.
        pu_def_split = pu_def.split()
        pos = pu_def_split[0]
        definition = ' '.join(pu_def_split[1:])
        pu_definitions.append({
            "pos": pos,
            "definition": definition,
        })
    # TODO: doesn't work -- this selector matches nothing on the page.
    tag_ku = soup.select(
        'body > div > div > main > div > div:nth-child(1) '
        '> div.p-6.pt-0.flex.flex-col.gap-3 > div:nth-child(2) > span')
    ku_defs = [t.text for t in tag_ku]
    return {
        'pu_definitions': pu_definitions,
    }


def get_words_data(word_list):
    """Fetch definition data for every word, caching the result as JSON."""
    words_data_file = os.path.join(DATA_DIR, 'words_data.json')
    if os.path.exists(words_data_file):
        logger.debug("Getting cached response from {}".format(words_data_file))
        with open(words_data_file, 'r') as f:
            words_data = json.load(f)
    else:
        words_data = {}
        for word in word_list:
            url = URL_WORD_2.format(word=word)
            word_data = get_word_data(word, url)
            words_data[word] = word_data
        with open(words_data_file, 'w+') as f:
            json.dump(words_data, f, ensure_ascii=False, indent=2)
    return words_data
def main():
    for folder in [WORDS_DIR]:
        os.makedirs(folder, exist_ok=True)
    word_list = get_word_list()
    print(word_list)
    words_data = get_words_data(word_list)
    # Collect every word whose pu entry lists an ADJECTIVE sense.
    adjectives = []
    for word, data in words_data.items():
        for pu_def in data.get('pu_definitions', []):
            if pu_def.get('pos') == 'ADJECTIVE':
                adjectives.append(word)
    print('" | "'.join(adjectives))


if __name__ == "__main__":
    main()
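# Example usage (a sketch; the file name "scrape_nimi.py" is an assumption,
# not part of the source):
#
#   python scrape_nimi.py --debug
#   python scrape_nimi.py --raise-exceptions
#
# The first run fetches https://nimi.li/ and one linku.la page per word,
# writing HTML caches under data/words/ and parsed JSON under data/;
# subsequent runs read only from those caches. Delete the data/ directory
# to force a fresh scrape.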