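"""Scrape toki pona word data.

Pulls the word index from nimi.li and each word's page from linku.la, then
caches everything under data/ (all_words.html, words.json, words/<word>.html,
words_data.json) so later runs can work from the cached copies.
"""
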
import argparse
import json
import logging
import os

import requests
from bs4 import BeautifulSoup, NavigableString

logging.basicConfig()
logger = logging.getLogger()

URL_BASE = 'https://nimi.li/'
URL_WORD = URL_BASE + '{word}'
URL_WORD_2 = 'https://linku.la/words/{word}'

def setup_logging_level(debug=False):
    log_level = logging.DEBUG if debug else logging.INFO
    logger.setLevel(log_level)
    logger.debug("Debugging enabled")

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('query', nargs='*', default="", help="freeform")
    parser.add_argument('--debug', dest='debug', action='store_true')
    parser.add_argument('--raise-exceptions', dest='raise_exceptions', action='store_true')
    return parser.parse_args()

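# Example invocations (the script filename is assumed here; adjust to the real one):
#   python scrape_words.py --debug
#   python scrape_words.py --raise-exceptions pona
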
args = parse_args()
setup_logging_level(args.debug)

DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
WORDS_DIR = os.path.join(DATA_DIR, 'words')

def get_cache_or_url(url, cache_file):
    """Return the body of `url`, serving it from `cache_file` when that exists."""
    if os.path.exists(cache_file):
        logger.debug("Getting cached response from {}".format(cache_file))
        with open(cache_file, 'r') as f:
            response_text = f.read()
    else:
        os.makedirs(os.path.dirname(cache_file), exist_ok=True)
        logger.debug("Getting response from {}".format(url))
        response = requests.get(url)
        if response.status_code != 200:
            if args.raise_exceptions:
                raise Exception("Error getting response from {}".format(url))
            # Without --raise-exceptions the error page still gets cached below.
            logger.error("Error getting response from {}".format(url))
        response_text = response.text
        with open(cache_file, 'w+') as f:
            f.write(response.text)
    return response_text

def get_word_list():
    words_file = os.path.join(DATA_DIR, 'words.json')
    if os.path.exists(words_file):
        logger.debug("Getting cached response from {}".format(words_file))
        with open(words_file, 'r') as f:
            words = json.load(f)
    else:
        words = set()
        cache_file = os.path.join(DATA_DIR, 'all_words.html')
        response_text = get_cache_or_url(URL_BASE, cache_file)

        soup = BeautifulSoup(response_text, 'html.parser')

        word_tags = soup.select('main > div.grid p.font-pona')
        for word_tag in word_tags:
            # Some index entries carry a trailing digit marker; strip it.
            if word_tag.text[-1] in ['0', '1', '2', '3']:
                final_word = word_tag.text[:-1]
            else:
                final_word = word_tag.text
            words.add(final_word)
        # Sets are not JSON serializable, so store a sorted list.
        words = sorted(words)
        with open(words_file, 'w+') as f:
            json.dump(words, f, ensure_ascii=False, indent=2)
    return words

def get_word_data(word, url):
    cache_file = os.path.join(WORDS_DIR, f'{word}.html')
    response_text = get_cache_or_url(url, cache_file)

    soup = BeautifulSoup(response_text, 'html.parser')

    # Raw string so the escaped colons in the Tailwind class names reach the CSS selector intact.
    tag = soup.select(r'body > div > div.px-4.sm\:px-8.lg\:px-16.m-auto.max-w-screen-xl > main > div.grid.sm\:grid-cols-2.mt-6.gap-6 > div:nth-child(1) > div')

    # Each matched div reads "<POS> <definition...>"; split off the part of speech.
    pu_defs = [t.text for t in tag]
    pu_definitions = []
    for pu_def in pu_defs:
        pu_def_split = pu_def.split()
        pos = pu_def_split[0]
        definition = ' '.join(pu_def_split[1:])
        pu_definitions.append({
            "pos": pos,
            "definition": definition,
        })

    # TODO: doesn't work
    tag_ku = soup.select('body > div > div > main > div > div:nth-child(1) > div.p-6.pt-0.flex.flex-col.gap-3 > div:nth-child(2) > span')
    ku_defs = [t.text for t in tag_ku]

    return {
        'pu_definitions': pu_definitions
    }

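# Sketch of the per-word value cached in data/words_data.json, based on the
# return value above (the "pos"/"definition" strings shown are illustrative):
#   {"pu_definitions": [{"pos": "ADJECTIVE", "definition": "..."}]}
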
def get_words_data(word_list):
    words_data_file = os.path.join(DATA_DIR, 'words_data.json')
    if os.path.exists(words_data_file):
        logger.debug("Getting cached response from {}".format(words_data_file))
        with open(words_data_file, 'r') as f:
            words_data = json.load(f)
    else:
        words_data = {}
        for word in word_list:
            url = URL_WORD_2.format(word=word)
            word_data = get_word_data(word, url)
            words_data[word] = word_data
        with open(words_data_file, 'w+') as f:
            json.dump(words_data, f, ensure_ascii=False, indent=2)
    return words_data

def main():
    for folder in [WORDS_DIR]:
        os.makedirs(folder, exist_ok=True)

    word_list = get_word_list()
    print(word_list)

    words_data = get_words_data(word_list)

    # Collect every word whose pu definition is tagged ADJECTIVE.
    adjectives = []
    for word, data in words_data.items():
        for pu_def in data.get('pu_definitions'):
            if pu_def.get('pos') == 'ADJECTIVE':
                adjectives.append(word)
    print('" | "'.join(adjectives))

if __name__ == "__main__":
    main()