From 9f1790c2015c45c55c6f03adaf5873c3980205fd Mon Sep 17 00:00:00 2001 From: Daniel Tsvetkov Date: Tue, 3 Aug 2021 10:59:17 +0200 Subject: [PATCH] updated tokenizer --- tww/main.py | 20 ++++++++++ tww/tokenizer.py | 96 +++++++++++++++++++++++++++--------------------- 2 files changed, 74 insertions(+), 42 deletions(-) create mode 100644 tww/main.py diff --git a/tww/main.py b/tww/main.py new file mode 100644 index 0000000..ab8bc99 --- /dev/null +++ b/tww/main.py @@ -0,0 +1,20 @@ +E_NO_SUCH_TIME = "Time does not exist" + +# https://dateparser.readthedocs.io/en/latest/settings.html#settings +PARSER_TYPE_TIMESTAMP = "TIMESTAMP" +PARSER_TYPE_ABSOLUTE_TIME = "ABSOLUTE_TIME" +PARSER_TYPE_RELATIVE_TIME = "RELATIVE_TIME" + +PARSER_TYPE_TIMEZONE_TRANSLATION = "TIMEZONE_TRANSLATION" +PARSER_TYPE_TIME_DELTA = "TIME_DELTA" +PARSER_TYPE_TIMEZONE_DIFFERENCE = "TIMEZONE_DIFFERENCE" +PARSER_TYPE_TIME_IN_LOCATION = "TIME_IN_LOCATION" +PARSER_TYPE_TIME_DIFFERENCE = "TIME_DIFFERENCE" +PARSER_TYPE_WORKDAYS = "WORKDAYS" +PARSER_TYPE_DAY_OF_WEEK = "DAY_OF_WEEK" +PARSER_TYPE_TIMEZONE = "TIMEZONE" +PARSER_TYPE_CALENDAR = "CALENDAR" + + +def parser(type, language, output_fmt): + return None \ No newline at end of file diff --git a/tww/tokenizer.py b/tww/tokenizer.py index 9d84b2e..e3ee433 100644 --- a/tww/tokenizer.py +++ b/tww/tokenizer.py @@ -49,6 +49,8 @@ r_timezone = re.compile('(.*)\s(?:timezone|timezones|tz)', flags=re.IGNORECASE) r_calendar_year = re.compile('(?:cal year|calendar year)\s*(.*)', flags=re.IGNORECASE) r_calendar_month = re.compile('(?:calendar|cal|month|cal month|calendar month)\s*(.*)', flags=re.IGNORECASE) r_timezone_2 = re.compile('(?:timezone in|timezones in|tz in|timezone|timezones|tz)\s(.*)?', flags=re.IGNORECASE) +r_timezone_cities = re.compile('(?:cities with timezones|countries with timezones|cities with tz|countries with tz)\s(.*)?', flags=re.IGNORECASE) +r_weeknum = re.compile('(?:week number|week num|weeknum|week)\s(.*)?', flags=re.IGNORECASE) def handler_time_now_local(): @@ -213,7 +215,10 @@ QUERY_TYPE_CAL = "calendar" h_default = '' h_unix_s = 'dt->unix_s' h_unix_ms = 'dt->unix_ms' +h_week_num = 'dt->week_number' h_tz_offset = 'tz->tz_offset' +h_tz_iana = 'tz->found_from_offset_tzs' +h_tz_abbr = 'tz->found_from_abbr_tzs' h_time_in = 'dt->hh:mm' h_translation = 'dt->iso8601_full' h_default_dt = 'dt->iso8601_full' @@ -223,39 +228,41 @@ h_cal_year = 'cal->year' h_cal_month = 'cal->month' regex_handlers = [ - (r_time_in_epoch_s_now, handler_time_now_local, QUERY_TYPE_DT, h_unix_s), - (r_time_in_epoch_s_now, handler_time_now_utc, QUERY_TYPE_DT, h_unix_s), - (r_time_in_epoch_s2, handler_generic_parser, QUERY_TYPE_DT, h_unix_s), - (r_time_in_epoch_s3, handler_generic_parser, QUERY_TYPE_DT, h_unix_s), - (r_time_in_epoch_ms_now, handler_time_now_local, QUERY_TYPE_DT, h_unix_ms), - (r_time_in_epoch_ms_now, handler_time_now_utc, QUERY_TYPE_DT, h_unix_ms), - (r_time_in_epoch_ms2, handler_generic_parser, QUERY_TYPE_DT, h_unix_ms), - (r_time_in_epoch_ms3, handler_generic_parser, QUERY_TYPE_DT, h_unix_ms), - (r_timezone_translation, handler_timezone_translation, QUERY_TYPE_DT_TR, h_translation), - (r_timezone_translation_in_to, handler_timezone_translation_in_to, QUERY_TYPE_DT_TR, h_translation), - (r_time_since, handler_time_since_until, QUERY_TYPE_TD, h_default_td), - (r_time_until, handler_time_since_until, QUERY_TYPE_TD, h_default_td), - (r_time_between, handler_time_diff, QUERY_TYPE_TD, h_default_td), - (r_tz_between, handler_tz_diff, QUERY_TYPE_TD, h_default_td), - (r_time_plus, handler_time_plus, QUERY_TYPE_DT, h_default_dt), - (r_time_plus, handler_time_plus_rev, QUERY_TYPE_DT, h_default_dt), - (r_time_minus, handler_time_minus, QUERY_TYPE_DT, h_default_dt), - (r_time_before, handler_time_before, QUERY_TYPE_DT, h_default_dt), - (r_weekday_pre, handler_generic_parser, QUERY_TYPE_DT, h_day_of_week), - (r_weekday_post, handler_generic_parser, QUERY_TYPE_DT, h_day_of_week), - (r_workdays_since, handler_workdays_since_until, QUERY_TYPE_TD, h_default_td), - (r_workdays_until, handler_workdays_since_until, QUERY_TYPE_TD, h_default_td), - (r_workdays_between, handler_workdays_diff, QUERY_TYPE_TD, h_default_td), - (r_workhours_since, handler_workhours_since_until, QUERY_TYPE_TD, h_default_td), - (r_workhours_until, handler_workhours_since_until, QUERY_TYPE_TD, h_default_td), - (r_workhours_between, handler_workhours_diff, QUERY_TYPE_TD, h_default_td), - (r_time_in, handler_time_in_parser, QUERY_TYPE_DT, h_time_in), - (r_timezone, handler_timezone, QUERY_TYPE_TZ, h_tz_offset), - (r_timezone_2, handler_timezone, QUERY_TYPE_TZ, h_tz_offset), - (r_hour_minute_timezone, handler_timezone_creation, QUERY_TYPE_DT_TR, h_translation), - (r_calendar_year, handler_calendar, QUERY_TYPE_CAL, h_cal_year), - (r_calendar_month, handler_calendar, QUERY_TYPE_CAL, h_cal_month), - (r_generic, handler_generic_parser, QUERY_TYPE_DT, h_default_dt), + (r_time_in_epoch_s_now, handler_time_now_local, QUERY_TYPE_DT, [h_unix_s]), + (r_time_in_epoch_s_now, handler_time_now_utc, QUERY_TYPE_DT, [h_unix_s]), + (r_time_in_epoch_s2, handler_generic_parser, QUERY_TYPE_DT, [h_unix_s]), + (r_time_in_epoch_s3, handler_generic_parser, QUERY_TYPE_DT, [h_unix_s]), + (r_time_in_epoch_ms_now, handler_time_now_local, QUERY_TYPE_DT, [h_unix_ms]), + (r_time_in_epoch_ms_now, handler_time_now_utc, QUERY_TYPE_DT, [h_unix_ms]), + (r_time_in_epoch_ms2, handler_generic_parser, QUERY_TYPE_DT, [h_unix_ms]), + (r_time_in_epoch_ms3, handler_generic_parser, QUERY_TYPE_DT, [h_unix_ms]), + (r_timezone_translation, handler_timezone_translation, QUERY_TYPE_DT_TR, [h_translation]), + (r_timezone_translation_in_to, handler_timezone_translation_in_to, QUERY_TYPE_DT_TR, [h_translation]), + (r_time_since, handler_time_since_until, QUERY_TYPE_TD, [h_default_td]), + (r_time_until, handler_time_since_until, QUERY_TYPE_TD, [h_default_td]), + (r_time_between, handler_time_diff, QUERY_TYPE_TD, [h_default_td]), + (r_tz_between, handler_tz_diff, QUERY_TYPE_TD, [h_default_td]), + (r_time_plus, handler_time_plus, QUERY_TYPE_DT, [h_default_dt]), + (r_time_plus, handler_time_plus_rev, QUERY_TYPE_DT, [h_default_dt]), + (r_time_minus, handler_time_minus, QUERY_TYPE_DT, [h_default_dt]), + (r_time_before, handler_time_before, QUERY_TYPE_DT, [h_default_dt]), + (r_weekday_pre, handler_generic_parser, QUERY_TYPE_DT, [h_day_of_week]), + (r_weekday_post, handler_generic_parser, QUERY_TYPE_DT, [h_day_of_week]), + (r_workdays_since, handler_workdays_since_until, QUERY_TYPE_TD, [h_default_td]), + (r_workdays_until, handler_workdays_since_until, QUERY_TYPE_TD, [h_default_td]), + (r_workdays_between, handler_workdays_diff, QUERY_TYPE_TD, [h_default_td]), + (r_workhours_since, handler_workhours_since_until, QUERY_TYPE_TD, [h_default_td]), + (r_workhours_until, handler_workhours_since_until, QUERY_TYPE_TD, [h_default_td]), + (r_workhours_between, handler_workhours_diff, QUERY_TYPE_TD, [h_default_td]), + (r_time_in, handler_time_in_parser, QUERY_TYPE_DT, [h_time_in]), + (r_timezone, handler_timezone, QUERY_TYPE_TZ, [h_tz_offset]), + (r_timezone_cities, handler_timezone, QUERY_TYPE_TZ, [h_tz_iana, h_tz_abbr]), + (r_timezone_2, handler_timezone, QUERY_TYPE_TZ, [h_tz_offset]), + (r_hour_minute_timezone, handler_timezone_creation, QUERY_TYPE_DT_TR, [h_translation]), + (r_calendar_year, handler_calendar, QUERY_TYPE_CAL, [h_cal_year]), + (r_calendar_month, handler_calendar, QUERY_TYPE_CAL, [h_cal_month]), + (r_weeknum, handler_generic_parser, QUERY_TYPE_DT, [h_week_num]), + (r_generic, handler_generic_parser, QUERY_TYPE_DT, [h_default_dt]), ] @@ -271,20 +278,23 @@ def try_regex(r, s): def tokenize(s): solutions = [] - for r, h, t, hi in regex_handlers: + for r, h, t, his in regex_handlers: logger.debug("Trying regex: {}".format(r)) g = try_regex(r, s) if g is not None: try: logger.debug("Matched regex: {}".format(r)) - logger.debug("Running handler: {} | query_type: {} | output_type: {}".format(h.__name__, t, hi)) + logger.debug("Running handler: {} | query_type: {} | output_type: {}".format(h.__name__, t, his)) result = h(*g) except Exception as e: logger.debug("Exception from handler: {} -> {}".format(h.__name__, e)) continue if result is not None: logger.debug("Matched regex: {}".format(r)) - solutions.append((h.__name__, result, t, hi)) + for hi in his: + solution = h.__name__, result, t, hi + if solution: + solutions.append(solution) return solutions @@ -295,7 +305,7 @@ def pretty_print_dict(obj): return colorful_json -def show_magic_results(obj, args, results=1): +def show_magic_results(obj, args): rv = [] for solution in obj['solutions']: entry_proxy = Cut(solution, sep='->') @@ -310,14 +320,14 @@ def show_magic_results(obj, args, results=1): continue if args.handlers: # TODO: not working yet: - user_handlers = args.handlers.split(',') - for user_handler in user_handlers: - to_print = "{} -> {}".format(user_handler, highlight_result) + # user_handlers = args.handlers.split(',') + # for user_handler in user_handlers: + to_print = "{} -> {}".format(solution['handler'], highlight_result) else: to_print = highlight_result rv.append(to_print) print(to_print) - if len(rv) >= results: + if len(rv) >= args.results: break return rv @@ -331,6 +341,7 @@ def dt_pretty(dt): rv["iso8601_date"] = dt.strftime('%Y-%m-%d') rv["iso8601_time"] = dt.strftime('%H:%M:%S') rv["day_of_week_number"] = dt.strftime("%w") + rv["week_number"] = dt.strftime("%W") rv["locale"] = custom_locale with setlocale(locale.LC_TIME, custom_locale.get("lc_time")): rv["locale_month"] = dt.strftime("%B") @@ -441,7 +452,8 @@ def parse_args(): parser.add_argument('query', nargs='*', default="", help="freeform") parser.add_argument('--locale', dest='locale') parser.add_argument('--tz', dest='tz', default='local') - parser.add_argument('--handlers', dest='handlers') + parser.add_argument('--handlers', dest='handlers', action='store_true') + parser.add_argument('--results', dest='results', type=int, default=10) parser.add_argument('--show', dest='show') parser.add_argument('--full', dest='full', action='store_true') parser.add_argument('--debug', dest='debug', action='store_true')