# toki_pona/scrape/app.py
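"""Scrape toki pona word data.

Builds the word list from the nimi.li index and fetches per-word definition
pages from linku.la, caching every HTTP response under data/ so repeated
runs can work offline.

Run directly, e.g. ``python app.py --debug``.
"""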

import argparse
import json
import logging
import os
import requests
from bs4 import BeautifulSoup, NavigableString

logging.basicConfig()
logger = logging.getLogger()

URL_BASE = 'https://nimi.li/'
URL_WORD = URL_BASE + '{word}'
URL_WORD_2 = 'https://linku.la/words/{word}'


def setup_logging_level(debug=False):
    log_level = logging.DEBUG if debug else logging.INFO
    logger.setLevel(log_level)
    logger.debug("Debugging enabled")


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('query', nargs='*', default="", help="freeform")
    parser.add_argument('--debug', dest='debug', action='store_true')
    parser.add_argument('--raise-exceptions', dest='raise_exceptions', action='store_true')
    return parser.parse_args()


args = parse_args()
setup_logging_level(args.debug)

DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
WORDS_DIR = os.path.join(DATA_DIR, 'words')


def get_cache_or_url(url, cache_file):
    """Return the body of url, reading from cache_file if it exists and writing to it otherwise."""
    if os.path.exists(cache_file):
        logger.debug("Getting cached response from {}".format(cache_file))
        with open(cache_file, 'r') as f:
            response_text = f.read()
    else:
        os.makedirs(os.path.dirname(cache_file), exist_ok=True)
        logger.debug("Getting response from {}".format(url))
        response = requests.get(url)
        if response.status_code != 200:
            if args.raise_exceptions:
                raise Exception("Error getting response from {}".format(url))
            # Without --raise-exceptions the error body is still cached below.
            logger.error("Error getting response from {}".format(url))
        response_text = response.text
        with open(cache_file, 'w+') as f:
            f.write(response.text)
    return response_text


def get_word_list():
    """Return the list of toki pona words, scraping the nimi.li index on the first run."""
    words_file = os.path.join(DATA_DIR, 'words.json')
    if os.path.exists(words_file):
        logger.debug("Getting cached response from {}".format(words_file))
        with open(words_file, 'r') as f:
            words = json.load(f)
    else:
        words = set()
        cache_file = os.path.join(DATA_DIR, 'all_words.html')
        response_text = get_cache_or_url(URL_BASE, cache_file)
        soup = BeautifulSoup(response_text, 'html.parser')
        word_tags = soup.select('main > div.grid p.font-pona')
        for word_tag in word_tags:
            # Some entries end in a digit; strip it to get the bare word.
            if word_tag.text[-1] in ['0', '1', '2', '3']:
                final_word = word_tag.text[:-1]
            else:
                final_word = word_tag.text
            words.add(final_word)
        # A set is not JSON-serializable, so store a sorted list.
        words = sorted(words)
        with open(words_file, 'w+') as f:
            json.dump(words, f, ensure_ascii=False, indent=2)
    return words


def get_word_data(word, url):
    """Scrape the pu definitions for a single word from its linku.la page."""
    cache_file = os.path.join(WORDS_DIR, f'{word}.html')
    response_text = get_cache_or_url(url, cache_file)
    soup = BeautifulSoup(response_text, 'html.parser')

    # Raw strings keep the backslash-escaped colons in the Tailwind class names.
    pu_tags = soup.select(r'body > div > div.px-4.sm\:px-8.lg\:px-16.m-auto.max-w-screen-xl > main > div.grid.sm\:grid-cols-2.mt-6.gap-6 > div:nth-child(1) > div')
    pu_defs = [t.text for t in pu_tags]
    pu_definitions = []
    for pu_def in pu_defs:
        # The first whitespace-separated token is treated as the part of speech.
        pu_def_split = pu_def.split()
        pos = pu_def_split[0]
        definition = ' '.join(pu_def_split[1:])
        pu_definitions.append({
            "pos": pos,
            "definition": definition,
        })

    # TODO: doesn't work
    ku_tags = soup.select(r'body > div > div > main > div > div:nth-child(1) > div.p-6.pt-0.flex.flex-col.gap-3 > div:nth-child(2) > span')
    ku_defs = [t.text for t in ku_tags]

    return {
        'pu_definitions': pu_definitions,
    }


def get_words_data(word_list):
    """Return a mapping of word -> scraped data, cached as words_data.json."""
    words_data_file = os.path.join(DATA_DIR, 'words_data.json')
    if os.path.exists(words_data_file):
        logger.debug("Getting cached response from {}".format(words_data_file))
        with open(words_data_file, 'r') as f:
            words_data = json.load(f)
    else:
        words_data = {}
        for word in word_list:
            url = URL_WORD_2.format(word=word)
            word_data = get_word_data(word, url)
            words_data[word] = word_data
        with open(words_data_file, 'w+') as f:
            json.dump(words_data, f, ensure_ascii=False, indent=2)
    return words_data


def main():
    for folder in [WORDS_DIR]:
        os.makedirs(folder, exist_ok=True)
    word_list = get_word_list()
    print(word_list)
    words_data = get_words_data(word_list)
    adjectives = []
    for word, data in words_data.items():
        for pu_def in data.get('pu_definitions', []):
            if pu_def.get('pos') == 'ADJECTIVE':
                adjectives.append(word)
    print('" | "'.join(adjectives))


if __name__ == "__main__":
    main()