added grammar

This commit is contained in:
Dani Tsvetkov 2024-04-10 07:32:10 +02:00
parent 3b25d0c3d5
commit 411f26bb7b
1 changed files with 60 additions and 0 deletions

60
scrape/grammar_parser.py Normal file
View File

@ -0,0 +1,60 @@
from lark import Lark, tree
# Sample Toki Pona sentence used as the parser's input further down.
sentence = 'tenpo lon la mi sitelen e lipu sona'

# Lark grammar, adapted from https://github.com/kilipan/nasin-toki
#
# Start rule is `sentence`: optional "... la" context clauses, then one or
# more subjects (bare "mi"/"sina", or phrases joined by "en" and closed by
# "li"), then predicates made of optional preverbs, a phrase, "e" objects and
# prepositional phrases.
#
# Several terminals (O, ANU, PARTICLE, A, EMOTICLE, PRONOUN, QUESTION) are
# declared but not referenced by any rule yet.
# NOTE(review): "ni" only appears in the unused PRONOUN terminal, so it cannot
# currently be matched as a phrase head or modifier -- confirm whether it
# should also be listed in CONTENT_WORD.
# NOTE(review): `loan_word` is a rule built from syllable terminals while
# whitespace is ignored, so whitespace between syllables is presumably
# accepted inside a single loan word -- confirm this is intended.
grammar = """
sentence: ((phrase|PREPOSITION phrase|sentence) LA)* (PRONOUN_MI_SINA | phrase (EN phrase)* LI)+ (PREVERB* phrase (E phrase)* (PREPOSITION phrase)*)*
phrase: head PI? modifier*
head: CONTENT_WORD | loan_word
modifier: CONTENT_WORD | loan_word
EN: "en"
LI: "li"
E: "e"
LA: "la"
PI: "pi"
O: "o"
ANU: "anu"
PARTICLE: EN | LI | E | LA | PI | O | ANU
A: "a"
EMOTICLE: A
CONTENT_WORD: "akesi" | "ala" | "alasa" | "ale" | "anpa" | "ante" | "awen" | "esun" | "ijo" | "ike" | "ilo" | "insa" | "jaki" | "jan" | "jelo" | "jo" | "kala" | "kalama" | "kama" | "kasi" | "ken" | "kepeken" | "kili" | "kiwen" | "ko" | "kon" | "kule" | "kulupu" | "kute" | "lape" | "laso" | "lawa" | "len" | "lete" | "lili" | "linja" | "lipu" | "loje" | "lon" | "luka" | "lukin" | "lupa" | "ma" | "mama" | "mani" | "mi" | "moku" | "moli" | "monsi" | "mu" | "mun" | "musi" | "mute" | "nanpa" | "nasa" | "nasin" | "nena" | "nimi" | "noka" | "olin" | "ona" | "open" | "pakala" | "pali" | "palisa" | "pan" | "pana" | "pilin" | "pimeja" | "pini" | "pipi" | "poka" | "poki" | "pona" | "pu" | "sama" | "seli" | "selo" | "seme" | "sewi" | "sijelo" | "sike" | "sin" | "sina" | "sinpin" | "sitelen" | "sona" | "soweli" | "suli" | "suno" | "supa" | "suwi" | "tan" | "taso" | "tawa" | "telo" | "tenpo" | "toki" | "tomo" | "tu" | "unpa" | "uta" | "utala" | "walo" | "wan" | "waso" | "wawa" | "weka" | "wile" | "kijetesantakalu" | "kin" | "kipisi" | "ku" | "leko" | "meli" | "mije" | "misikeke" | "monsuta" | "namako" | "soko" | "tonsi"
PRONOUN_MI_SINA: "mi" | "sina"
PRONOUN: PRONOUN_MI_SINA | "ona" | "ni"
PREPOSITION: "lon" | "tawa" | "tan" | "sama" | "kepeken"
PREVERB: "wile" | "sona" | "awen" | "kama" | "ken" | "lukin"
QUESTION: "seme"
VOWEL: "a" | "e" | "i" | "o" | "u"
CONSONANT: "p" | "t" | "k" | "s" | "m" | "n" | "l" | "j" | "w"
CONSONANT_N: "n"
VOWEL_CAPITAL: "A" | "E" | "I" | "O" | "U"
CONSONANT_CAPITAL: "P" | "T" | "K" | "S" | "M" | "N" | "L" | "J" | "W"
SYLLABLE: CONSONANT? VOWEL CONSONANT_N?
SYLLABLE_CAPITAL_V: VOWEL_CAPITAL CONSONANT_N?
SYLLABLE_CAPITAL_CV: CONSONANT_CAPITAL VOWEL CONSONANT_N?
loan_word: (SYLLABLE_CAPITAL_V | SYLLABLE_CAPITAL_CV) SYLLABLE*
%import common.WS
%ignore WS
"""
# Build the parser rooted at the `sentence` rule. ambiguity='explicit' asks
# Lark to keep every derivation of an ambiguous input in the result tree
# rather than resolving to a single one.
parser = Lark(grammar, start='sentence', ambiguity='explicit')

# Parse once and reuse the tree for both dumps instead of parsing the same
# sentence twice.
parsed = parser.parse(sentence)
print(parsed)           # raw Tree repr
print(parsed.pretty())  # indented, human-readable rendering