general solution based tokenizer

This commit is contained in:
Daniel Tsvetkov 2020-02-11 11:30:11 +01:00
parent 559bd39e9a
commit abbba47bb5
2 changed files with 152 additions and 49 deletions

View File

@ -1,10 +1,12 @@
import json
import re
import sys
from datetime import timedelta, datetime
from pprint import pprint, pformat
from datetime import datetime
from pygments import highlight, lexers, formatters
from tww import ISO_FORMAT, time_to_emoji
from tww import resolve_timezone, dateparser_parse_dt, get_utcnow, get_s_since_epoch, get_ms_since_epoch, \
dt_tz_translation, DEFAULT_FORMAT, get_local_now
dt_tz_translation, DEFAULT_FORMAT, get_local_now, query_to_format_result
# Catch-all pattern: captures the complete query as a single group.
r_generic = re.compile('(.*)', re.IGNORECASE)
# Matches a request for the current time as seconds since the epoch.
r_time_in_epoch_s_now = re.compile('(?:time since epoch|seconds since epoch)', re.IGNORECASE)
@ -13,10 +15,13 @@ r_time_in_epoch_s3 = re.compile('(?:seconds)?\s*since\s*(.*)', flags=re.IGNORECA
# All patterns below are raw strings so that ``\s`` reaches the regex engine
# as written instead of being treated as an (invalid) string escape.

# Epoch time in milliseconds; the "miliseconds" spelling is accepted as well.
r_time_in_epoch_ms_now = re.compile(r'(?:milliseconds since epoch)', flags=re.IGNORECASE)
r_time_in_epoch_ms2 = re.compile(r'(.*)?\s*(?:in|to)\s*(?:ms|milliseconds|miliseconds)', flags=re.IGNORECASE)
r_time_in_epoch_ms3 = re.compile(r'(?:ms|milliseconds|miliseconds)?\s*since\s*(.*)', flags=re.IGNORECASE)
# "[time] in <something>": the single group captures everything after "in".
r_time_in = re.compile(r'(?:time)?\s*in\s*(.*)', flags=re.IGNORECASE)
# Durations relative to one point in time, or between two points.
r_time_since = re.compile(r'(?:time)?\s*since\s*(.*)', flags=re.IGNORECASE)
r_time_until = re.compile(r'(?:time)?\s*until\s*(.*)', flags=re.IGNORECASE)
r_time_between = re.compile(r'(?:time)?\s*between\s*(.*)\s*and\s*(.*)', flags=re.IGNORECASE)
# "<datetime> in|to <timezone>": group 1 is the left side, group 2 the right.
r_timezone_translation = re.compile(r'(.*)?\s(?:in|to)\s(.*)', flags=re.IGNORECASE)
# Timezone look-ups, with the keyword after ("sofia tz") or before ("tz in sofia").
r_timezone = re.compile(r'(.*)?\s(?:timezone|timezones|tz)', flags=re.IGNORECASE)
r_timezone_2 = re.compile(r'(?:timezone in|timezones in|tz in|timezone|timezones|tz)\s(.*)?', flags=re.IGNORECASE)
def handler_time_s(dt_s: str) -> int:
@ -74,23 +79,44 @@ def handler_timezone_translation(dt_s: str, timezone_like_s: str) -> dict:
def handler_generic_parser(dt_s: str) -> datetime:
    """Solve the whole query string through ``query_to_format_result``.

    ``None`` is passed as the format so that the unformatted result is
    returned rather than a formatted string.
    """
    unformatted = query_to_format_result(dt_s, None)
    return unformatted
def handler_dateparser(dt_s: str) -> datetime:
    """Parse ``dt_s`` by delegating to ``dateparser_parse_dt``."""
    parsed = dateparser_parse_dt(dt_s)
    return parsed
def handler_time_in_parser(dt_s: str) -> datetime:
    """Solve "time in <place>" by rewriting it as the query "now to <place>".

    :param dt_s: the text captured after "in" (a timezone or location).
    :return: whatever ``query_to_format_result`` yields when no format is
        requested, i.e. the unformatted result.
    """
    # Pass ``None`` as the format explicitly: with the default format the
    # helper returns a formatted *string*, but this handler is registered as a
    # datetime-producing handler and its result is later fed to ``dt_pretty``,
    # which calls ``strftime`` on it.
    return query_to_format_result("now to {}".format(dt_s), None)
def handler_timezone(timezone_s: str):
    """Look up ``timezone_s`` by delegating to ``resolve_timezone``."""
    resolved = resolve_timezone(timezone_s)
    return resolved
# Labels describing what kind of value a handler produces; resolve_query()
# uses them to decide how a solution is rendered.
QUERY_TYPE_DT_TR = "datetime_translation"
QUERY_TYPE_DT = "datetime_details"
QUERY_TYPE_TZ = "timezone"
QUERY_TYPE_TD = "timedelta"

# (pattern, handler, query type) triples. tokenize() walks this list in order
# and unpacks exactly three items per entry, so every entry must be a triple
# and the order here is the order of the reported solutions.
regex_handlers = [
    (r_time_in_epoch_s_now, handler_time_s_now_local, QUERY_TYPE_DT),
    (r_time_in_epoch_s_now, handler_time_s_now_utc, QUERY_TYPE_DT),
    (r_time_in_epoch_s2, handler_time_s, QUERY_TYPE_DT),
    (r_time_in_epoch_s3, handler_time_s, QUERY_TYPE_DT),
    (r_time_in_epoch_ms_now, handler_time_ms_now_local, QUERY_TYPE_DT),
    (r_time_in_epoch_ms_now, handler_time_ms_now_utc, QUERY_TYPE_DT),
    (r_time_in_epoch_ms2, handler_time_ms, QUERY_TYPE_DT),
    (r_time_in_epoch_ms3, handler_time_ms, QUERY_TYPE_DT),
    (r_timezone_translation, handler_timezone_translation, QUERY_TYPE_DT_TR),
    (r_time_since, handler_time_since, QUERY_TYPE_TD),
    (r_time_until, handler_time_until, QUERY_TYPE_TD),
    (r_time_between, handler_time_diff, QUERY_TYPE_TD),
    (r_time_in, handler_time_in_parser, QUERY_TYPE_DT),
    (r_timezone, handler_timezone, QUERY_TYPE_TZ),
    (r_timezone_2, handler_timezone, QUERY_TYPE_TZ),
    # The catch-all pattern is tried with both general-purpose parsers.
    (r_generic, handler_dateparser, QUERY_TYPE_DT),
    (r_generic, handler_generic_parser, QUERY_TYPE_DT),
]
@ -104,16 +130,17 @@ def try_regex(r, s):
return groups
def tokenize(s):
    """Run every registered handler whose pattern accepts ``s``.

    :param s: the raw user query.
    :return: a list of ``(handler_name, (result,), query_type)`` tuples, one
        per handler that produced a non-``None`` result, in the order of
        ``regex_handlers``. The list is empty when nothing solved the query.
    """
    solutions = []
    for regex, handler, query_type in regex_handlers:
        groups = try_regex(regex, s)
        if groups is None:
            continue
        try:
            result = handler(*groups)
        except Exception:
            # Best effort: a handler that cannot cope with this query is
            # simply not a solution; the remaining handlers are still tried.
            continue
        if result is not None:
            # The result is wrapped in a 1-tuple; consumers index into it.
            solutions.append((handler.__name__, (result,), query_type))
    return solutions
@ -139,20 +166,71 @@ def test():
"now in dublin",
]
for s in test_strings:
print("{} -> {}".format(s, parse(s)))
print("{} -> {}".format(s, tokenize(s)))
def pretty_print_dict(obj):
    """Print ``obj`` as indented, syntax-highlighted JSON on stdout.

    :param obj: the structure to display (typically the dict built by
        ``resolve_query``).
    """
    formatted_json = json.dumps(
        obj,
        indent=2,
        # Keep non-ASCII text (e.g. the clock emoji) readable instead of
        # emitting \uXXXX escapes.
        ensure_ascii=False,
        # Display-only output: values json cannot encode natively (datetime,
        # timedelta, tzinfo objects, ...) are shown via str() rather than
        # aborting the whole printout with a TypeError.
        default=str,
    )
    colorful_json = highlight(formatted_json, lexers.JsonLexer(), formatters.TerminalFormatter())
    print(colorful_json)
def dt_pretty(dt):
    """Describe ``dt`` as a dict of ready-to-display representations.

    :param dt: an object offering ``strftime`` that is also accepted by
        ``time_to_emoji``, ``get_s_since_epoch`` and ``get_ms_since_epoch``.
    :return: dict mapping a representation name to its value; the keys are
        inserted in the order listed below.
    """
    return {
        "iso8601_full": dt.strftime(ISO_FORMAT),
        "iso8601_date": dt.strftime('%Y-%m-%d'),
        "iso8601_time": dt.strftime('%H:%M:%S'),
        "locale_dt": dt.strftime("%c"),
        "locale_day_of_week": dt.strftime("%A"),
        "locale_day_of_week_short": dt.strftime("%a"),
        "day_of_week_number": dt.strftime("%w"),
        "locale_month": dt.strftime("%B"),
        "locale_month_short": dt.strftime("%b"),
        "tz_name": dt.strftime("%Z"),
        "tz_offset": dt.strftime("%z"),
        "hh:mm": dt.strftime("%H:%M"),
        "locale_time": dt.strftime("%X"),
        "locale_date": dt.strftime("%x"),
        "emoji_time": time_to_emoji(dt),
        "unix_s": get_s_since_epoch(dt),
        "unix_ms": get_ms_since_epoch(dt),
    }
def resolve_query_type(query):
    """Return the solutions found by ``tokenize`` for ``query``.

    When ``tokenize`` finds nothing, a single fallback solution is returned
    instead: the current local time, labelled with the handler name "now".
    """
    found = tokenize(query)
    if found:
        return found
    return [["now", (get_local_now(),), QUERY_TYPE_DT]]
def resolve_query(query):
    """Solve ``query`` and describe every solution as a plain dict.

    :param query: the raw user query.
    :return: ``{"query": query, "solutions": [...]}`` where each solution
        carries the handler name, its query type and a payload whose key
        depends on that type.
    """
    elements = []
    for handler, results, query_type in resolve_query_type(query):
        element = {"handler": handler, "query_type": query_type}
        if query_type == QUERY_TYPE_DT:
            element["dt"] = dt_pretty(results[0])
        elif query_type == QUERY_TYPE_DT_TR:
            # NOTE(review): tokenize() wraps each handler result in a 1-tuple,
            # so results[1] looks like it cannot exist for those solutions -
            # confirm the intended shape of translation results.
            element["src_dt"] = dt_pretty(results[0])
            element["dst_dt"] = dt_pretty(results[1])
        elif query_type == QUERY_TYPE_TZ:
            element["tz"] = results[0]
        elif query_type == QUERY_TYPE_TD:
            element["timedelta"] = results[0]
        elements.append(element)
    return {"query": query, "solutions": elements}
if __name__ == "__main__":
    # The command-line words form the query; without arguments a demo query
    # is solved instead.
    query = " ".join(sys.argv[1:]) or "now in sofia"
    result = resolve_query(query)
    pretty_print_dict(result)

View File

@ -213,7 +213,7 @@ def parse_query(query):
"""
Parses the user query to the datetime, tz/loc parts
"""
query = ' '.join(query)
# query = ' '.join(query)
query = query.strip()
if not query:
logger.critical("Use a query like <datetime-like> ['to' <timezone or location>]")
@ -246,6 +246,32 @@ def serialize_location(location):
}
def _parse_hours_minutes(offset):
    """Split an unsigned ``H``, ``HH``, ``HHMM`` or ``H:MM`` text into digit strings; ``None`` if malformed."""
    if ":" in offset:
        parts = offset.split(":")
        if len(parts) != 2:
            return None
        hhs, mms = parts
    elif len(offset) == 4:
        # Compact form without a colon, e.g. "0530".
        hhs, mms = offset[:2], offset[2:]
    else:
        hhs, mms = offset, "00"
    hhs, mms = hhs.strip(), mms.strip()
    # Only plain digits: a stray sign here would corrupt the formatted key.
    if not (hhs.isdecimal() and mms.isdecimal()):
        return None
    return hhs, mms


def find_from_offset(query):
    """Resolve an offset-style query such as ``utc+2``, ``gmt-05:30``, ``+5`` or ``-0530``.

    :param query: the normalized query text.
    :return: the value returned by ``tzinfo_from_offset`` for the normalized
        ``+HHMM`` / ``-HHMM`` key, or ``(None, [])`` when ``query`` is not
        recognised as an offset.
    """
    for universal_alias in ("gmt", "utc", "+", "-"):
        if not query.startswith(universal_alias):
            continue
        offset = query[len(universal_alias):].strip()
        bare_sign = universal_alias in ("+", "-")
        sign = universal_alias if bare_sign else "+"
        if not bare_sign and offset[:1] in ("+", "-"):
            # "utc-5": the sign follows the alias and has to end up in front
            # of the zero-padded digits, not inside them.
            sign, offset = offset[0], offset[1:].strip()
        parsed = _parse_hours_minutes(offset)
        if parsed is None:
            continue
        try:
            hours, minutes = int(parsed[0]), int(parsed[1])
            if not bare_sign and hours == 0 and minutes == 0:
                # "utc-0" is plain UTC; keep the "+0000" key for it.
                sign = "+"
            return tzinfo_from_offset("{}{:02d}{:02d}".format(sign, hours, minutes))
        except Exception:
            # Deliberately best effort: any failure while looking up the
            # offset means "not an offset query", never a crash for the caller.
            continue
    return None, []
def resolve_timezone(query):
if not query:
query = "utc"
@ -254,10 +280,7 @@ def resolve_timezone(query):
normal_query = query.lower().strip()
found_from_iana_tz = NORMALIZED_TZ_DICT.get(normal_query, "")
found_from_abbr_tzs = list(NORMALIZED_TZ_ABBR.get(normal_query, set()))
found_from_offset_tz = None
try:
found_from_offset_tz = tzinfo_from_offset(normal_query)
except: ...
found_from_offset_tz, offset_tzs = find_from_offset(normal_query)
normal_tz = found_from_iana_tz
if not normal_tz:
if found_from_abbr_tzs:
@ -304,7 +327,7 @@ def resolve_timezone(query):
"normal_query": normal_query,
"found_from_iana_tz": found_from_iana_tz,
"found_from_abbr_tzs": found_from_abbr_tzs,
"found_from_offset_tzs": found_from_offset_tz,
"found_from_offset_tzs": offset_tzs,
"local_location": local_location,
"remote_location": remote_location,
"search_pytz": normal_tz,
@ -335,8 +358,7 @@ def solve_query(human_dt, human_tz_loc):
def format_result(result, fmt):
if result is None:
logger.critical("Could not solve query")
exit(1)
logger.error("Could not solve query")
logger.debug("Format: {}".format(fmt))
format_result = result.strftime(fmt)
logger.debug("Formated result: {} -> {}".format(result, format_result))
@ -346,8 +368,9 @@ def format_result(result, fmt):
def query_to_format_result(query, fmt=DEFAULT_FORMAT):
    """Parse and solve ``query``, optionally formatting the outcome.

    :param query: the user query string.
    :param fmt: format handed to ``format_result``. Pass a falsy value
        (e.g. ``None``) to receive the unformatted result of ``solve_query``.
    :return: the formatted result when ``fmt`` is truthy, otherwise the raw
        result.
    """
    human_dt, human_tz_loc = parse_query(query)
    result = solve_query(human_dt, human_tz_loc)
    if fmt:
        return format_result(result, fmt)
    return result
def main(args):
@ -442,8 +465,10 @@ def tzinfo_from_offset(offset: str) -> pytz.timezone:
tznames = TZ_OFFSETS.get(offset, [])
for tzname in tznames:
if tzname.startswith('Etc/GMT'):
return pytz.timezone(tzname)
return pytz.timezone(tznames[0])
return pytz.timezone(tzname), tznames
if tznames:
return pytz.timezone(tznames[0]), tznames
return None, []
def dateparser_parse_dt(s: str):
@ -467,7 +492,7 @@ def get_utcnow(tzaware: bool = True):
def get_local_now(tzaware: bool = True):
    """Return the current local wall-clock time.

    :param tzaware: when true, attach the timezone that
        ``tzinfo_from_offset`` resolves for the local UTC offset; when false,
        return a naive datetime.
    :return: a ``datetime`` for "now" in local time.
    """
    now = datetime.now()
    if not tzaware:
        # Naive *local* time; the UTC clock belongs to get_utcnow().
        return now
    # tzinfo_from_offset returns (tzinfo, names); only the tzinfo is attached.
    local_tz = tzinfo_from_offset(get_local_tz_offset())[0]
    return now.replace(tzinfo=local_tz)