From f13d40f2ee723c5663f5337789839e6b24a89b6d Mon Sep 17 00:00:00 2001 From: Daniel Tsvetkov Date: Fri, 13 Dec 2019 15:16:42 +0100 Subject: [PATCH] tokenizer next --- src/tww/time_lib.py | 191 +---------------------------------------- src/tww/tokenizer.py | 136 +++++++++++++++++++---------- src/tww/tww.py | 198 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 292 insertions(+), 233 deletions(-) diff --git a/src/tww/time_lib.py b/src/tww/time_lib.py index 96f10fa..3619df4 100644 --- a/src/tww/time_lib.py +++ b/src/tww/time_lib.py @@ -1,193 +1,6 @@ -import datetime -import os -from datetime import datetime, timedelta - -import pytz -from dateparser import parse as parse_dt -from dateparser.timezone_parser import StaticTzInfo -from dateutil.tz import tzlocal, gettz -from tww import parse_query, solve_query, TZ_OFFSETS - - -def time_ago(date=None, diff=None): - """ - Get a datetime object, timedelta object or a int() Epoch timestamp and - return a - pretty string like 'an hour ago', 'Yesterday', '3 months ago', - 'just now', etc - Modified from: http://stackoverflow.com/a/1551394/141084 - """ - now = get_utcnow() - if not date: - if diff: - diff = timedelta(seconds=diff) - else: - diff = now - now - else: - if type(date) is str: - parsed_dt = parse_dt(date) - if parsed_dt.tzinfo is not None: - now = get_utcnow(tzaware=True) - diff = now - parsed_dt - elif type(date) is timedelta: - diff = date - elif type(date) is int: - diff = now - datetime.fromtimestamp(date) - elif isinstance(date, datetime): - if date.tzinfo is not None: - now = get_utcnow(tzaware=True) - diff = now - date - else: - raise ValueError('invalid date %s of type %s' % (date, type(date))) - - sign = '' - if diff.days < 0: - diff = -diff - sign = '-' - second_diff = diff.seconds - day_diff = diff.days - - if day_diff == 0: - if second_diff < 60: - return str("{}{:02d}".format(sign, second_diff)) - seconds = second_diff % 60 - if second_diff < 3600: - min_diff = second_diff // 60 - return str("{}{:02d}:{:02d}".format(sign, min_diff, seconds)) - minutes = second_diff % 3600 // 60 - if second_diff < 86400: - hrs_diff = second_diff // 3600 - return str("{}{:02d}:{:02d}:{:02d}".format(sign, hrs_diff, minutes, - seconds)) - seconds = second_diff % 60 - minutes = second_diff % 3600 // 60 - hours = second_diff // 3600 - if day_diff < 365: - if day_diff < 30: - return str("{}{}d{:02d}:{:02d}:{:02d}".format(sign, day_diff, hours, - minutes, seconds)) - months, days = day_diff // 30, day_diff % 30 - return str( - "{}{}m{}d{:02d}:{:02d}:{:02d}".format(sign, months, days, hours, - minutes, seconds)) - years = day_diff // 365 - days = day_diff % 365 - months, days = days // 30, days % 30 - return str( - "{}{}y{}m{}d{:02d}:{:02d}:{:02d}".format(sign, years, months, days, - hours, minutes, seconds)) - - -def query_to_dt(query): - human_dt, human_tz_loc = parse_query(query) - return solve_query(human_dt, human_tz_loc) - - # ================== -def tzinfo_from_offset(offset: str) -> pytz.timezone: - if ':' in offset: - offset = ''.join(offset.split(':')) - tznames = TZ_OFFSETS.get(offset) - if not tznames: - return pytz.timezone("utc") - for tzname in tznames: - if tzname.startswith('Etc/GMT'): - return pytz.timezone(tzname) - return pytz.timezone(tznames[0]) - - -def dateparser_parse_dt(s: str): - # print("Dateparser query: {}".format(s)) - parsed = parse_dt(s) - # print("Dateparser parsed query: {}".format(parsed)) - if not parsed: - return None - if parsed.tzinfo is None: - parsed = parsed.replace(tzinfo=pytz.timezone("utc")) - return parsed - - -def get_utcnow(tzaware: bool = True): - if tzaware: - return datetime.utcnow().replace(tzinfo=pytz.UTC) - return datetime.utcnow() - - -def dt_tz_translation(dt: datetime, to_tz_offset: str, - from_tz_offset: str = "+00:00") -> datetime: - if ':' in to_tz_offset: - to_shh, to_mm = to_tz_offset.split(':') - else: - to_shh, to_mm = to_tz_offset[:-2], to_tz_offset[-2:] - if ':' in from_tz_offset: - from_shh, from_mm = from_tz_offset.split(':') - else: - from_shh, from_mm = from_tz_offset[:-2], to_tz_offset[-2:] - tzinfo = tzinfo_from_offset(to_tz_offset) - if dt.tzinfo: - return dt.astimezone(tzinfo) - r_dt = dt + timedelta(hours=int(to_shh), minutes=int(to_mm)) - timedelta( - hours=int(from_shh), minutes=int(from_mm)) - r_dt = tzinfo.localize(r_dt) - return r_dt - - -def format_offset_from_timedelta(tdelta: timedelta): - sign = "+" if tdelta.seconds >= 0 else "-" - h = tdelta.seconds // 3600 - m = (tdelta.seconds // 60) - h * 60 - return "{}{:02d}:{:02d}".format(sign, h, m) - - -def get_local_tzname_iana(): - return '/'.join(os.path.realpath(gettz()._filename).split('/')[-2:]) - - -def get_local_tz_offset(): - return format_offset_from_timedelta( - datetime.now(tzlocal()).tzinfo._std_offset) - - -def tzname_to_tz_offset(tzname_iana: str): - return format_offset_from_timedelta( - pytz.timezone(tzname_iana).utcoffset(get_utcnow(False))) - - -def get_dt_tz_offset(dt: datetime) -> timedelta: - if dt.tzinfo is not None: - if type(dt.tzinfo) is StaticTzInfo: - tzoffset = dt.tzinfo._StaticTzInfo__offset - else: - tzoffset = dt.tzinfo._utcoffset - return tzoffset - return timedelta(0) - - -def get_us_since_epoch(dt: datetime): - utc_seconds = int(dt.timestamp() * 1e6) - if dt.tzinfo is None: - return utc_seconds - local_seconds = get_dt_tz_offset(dt).seconds - return utc_seconds + local_seconds - - -def get_ms_since_epoch(dt): - return int(get_us_since_epoch(dt) / 1e3) - - -def get_s_since_epoch(dt): - return int(get_us_since_epoch(dt) / 1e6) - - -def epoch_to_dt(seconds): - return datetime.fromtimestamp(seconds) - - -def time_to_emoji(dt): - seconds = get_s_since_epoch(dt) - a = int((seconds / 900 - 3) / 2 % 24) - return chr(128336 + a // 2 + a % 2 * 12) - +from tww import dateparser_parse_dt, get_utcnow, dt_tz_translation, get_local_tzname_iana, get_local_tz_offset, \ + tzname_to_tz_offset, get_s_since_epoch, time_to_emoji if __name__ == "__main__": print(get_local_tzname_iana()) diff --git a/src/tww/tokenizer.py b/src/tww/tokenizer.py index e8e9687..7f77aee 100644 --- a/src/tww/tokenizer.py +++ b/src/tww/tokenizer.py @@ -1,42 +1,21 @@ import re -from datetime import timedelta +import sys +from datetime import timedelta, datetime -from src.tww.time_lib import dateparser_parse_dt, get_utcnow, \ - get_s_since_epoch, \ - get_ms_since_epoch +from tww import resolve_timezone, dateparser_parse_dt, get_utcnow, get_s_since_epoch, get_ms_since_epoch, \ + dt_tz_translation, DEFAULT_FORMAT, get_local_now -r_time_in_epoch_s = re.compile( - '\s*(.*)?\s*(?:in|to)\s*(?:epoch|seconds since epoch|seconds)\s*', - flags=re.IGNORECASE) -r_time_in_epoch_s2 = re.compile('\s*(?:seconds)?\s*since\s*(.*)\s*', - flags=re.IGNORECASE) -r_time_in_epoch_ms = re.compile( - '\s*(.*)?\s*(?:in|to)\s*(?:ms|milliseconds|miliseconds)\s*', - flags=re.IGNORECASE) -r_time_in_epoch_ms2 = re.compile( - '\s*(?:ms|milliseconds|miliseconds)?\s*since\s*(.*)\s*', - flags=re.IGNORECASE) -r_time_since = re.compile('\s*(?:time)?\s*since\s*(.*)\s*', flags=re.IGNORECASE) -r_time_until = re.compile('\s*(?:time)?\s*until\s*(.*)\s*', flags=re.IGNORECASE) -r_time_between = re.compile('\s*(?:time)?\s*between\s*(.*)\s*and\s*(.*)\s*', - flags=re.IGNORECASE) - -test_strings = [ - None, - "", - "s", - " ", - "Time since 2019-05-12", - "Since yesterday", - "time between yesterday and tomorrow", - "time until 25 december", - "time sinc", - "now in milliseconds", - "seconds since epoch", - "1992-01-27 to epoch", - "milliseconds since 1992-01-27", - -] +r_generic = re.compile('(.*)', flags=re.IGNORECASE) +r_time_in_epoch_s_now = re.compile('(?:time since epoch|seconds since epoch)', flags=re.IGNORECASE) +r_time_in_epoch_s2 = re.compile('(.*)?\s*(?:in|to)\s*(?:epoch|seconds since epoch|seconds)', flags=re.IGNORECASE) +r_time_in_epoch_s3 = re.compile('(?:seconds)?\s*since\s*(.*)', flags=re.IGNORECASE) +r_time_in_epoch_ms_now = re.compile('(?:milliseconds since epoch)', flags=re.IGNORECASE) +r_time_in_epoch_ms2 = re.compile('(.*)?\s*(?:in|to)\s*(?:ms|milliseconds|miliseconds)', flags=re.IGNORECASE) +r_time_in_epoch_ms3 = re.compile('(?:ms|milliseconds|miliseconds)?\s*since\s*(.*)', flags=re.IGNORECASE) +r_time_since = re.compile('(?:time)?\s*since\s*(.*)', flags=re.IGNORECASE) +r_time_until = re.compile('(?:time)?\s*until\s*(.*)', flags=re.IGNORECASE) +r_time_between = re.compile('(?:time)?\s*between\s*(.*)\s*and\s*(.*)', flags=re.IGNORECASE) +r_timezone_translation = re.compile('(.*)?\s(?:in|to)\s(.*)', flags=re.IGNORECASE) def handler_time_s(dt_s: str) -> int: @@ -47,6 +26,22 @@ def handler_time_ms(dt_s: str) -> int: return get_ms_since_epoch(dateparser_parse_dt(dt_s)) +def handler_time_s_now_local() -> int: + return get_s_since_epoch(get_local_now()) + + +def handler_time_ms_now_local() -> int: + return get_ms_since_epoch(get_local_now()) + + +def handler_time_s_now_utc() -> int: + return get_s_since_epoch(get_utcnow()) + + +def handler_time_ms_now_utc() -> int: + return get_ms_since_epoch(get_utcnow()) + + def handler_time_diff(start_dt: str, end_dt: str) -> timedelta: return dateparser_parse_dt(end_dt) - dateparser_parse_dt(start_dt) @@ -59,14 +54,32 @@ def handler_time_until(end_dt_s: str) -> timedelta: return handler_time_diff(str(get_utcnow()), end_dt_s) +def handler_timezone_translation(dt_s: str, timezone_like_s: str) -> datetime: + dt = dateparser_parse_dt(dt_s) + tz = resolve_timezone(timezone_like_s) + if tz: + offset = tz.get('tz_offset') + return dt_tz_translation(dt, offset) + + +def handler_generic_parser(dt_s: str) -> datetime: + return dateparser_parse_dt(dt_s) + + regex_handlers = [ - (r_time_in_epoch_s, handler_time_s), + (r_time_in_epoch_s_now, handler_time_s_now_local), + (r_time_in_epoch_s_now, handler_time_s_now_utc), (r_time_in_epoch_s2, handler_time_s), - (r_time_in_epoch_ms, handler_time_ms), + (r_time_in_epoch_s3, handler_time_s), + (r_time_in_epoch_ms_now, handler_time_ms_now_local), + (r_time_in_epoch_ms_now, handler_time_ms_now_utc), (r_time_in_epoch_ms2, handler_time_ms), + (r_time_in_epoch_ms3, handler_time_ms), + (r_timezone_translation, handler_timezone_translation), (r_time_since, handler_time_since), (r_time_until, handler_time_until), (r_time_between, handler_time_diff), + (r_generic, handler_generic_parser), ] @@ -81,15 +94,52 @@ def try_regex(r, s): def parse(s): + solutions = [] for r, h in regex_handlers: g = try_regex(r, s) if g is not None: try: - return h(*g) - except: - continue + result = h(*g) + except Exception as e: + result = None + solutions.append((h.__name__, result)) + return solutions + + +def test(): + test_strings = [ + None, + "", + "s", + " ", + "Time since 2019-05-12", + "Since yesterday", + "time between yesterday and tomorrow", + "time until 25 december", + "time sinc", + "now in milliseconds", + "seconds since epoch", + "1992-01-27 to epoch", + "milliseconds since 1992-01-27", + "now in sofia", + "now in PST", + "2 hours ago to Sydney", + "now in +03:00", + "now in dublin", + ] + for s in test_strings: + print("{} -> {}".format(s, parse(s))) if __name__ == "__main__": - for s in test_strings: - print("{} -> {}".format(s, parse(s))) + query = ' '.join(sys.argv[1:]) + results = parse(query) + for handler, result in results: + if type(result) is datetime: + print(" {} -> {}".format(handler, result.strftime(DEFAULT_FORMAT))) + elif type(result) is timedelta: + print(" {} -> {}".format(handler, result)) + elif type(result) is None: + print(" {} -> Couldn't solve query".format(handler)) + else: + print(" {} -> {}".format(handler, result)) diff --git a/src/tww/tww.py b/src/tww/tww.py index 30ab393..5e7b09e 100644 --- a/src/tww/tww.py +++ b/src/tww/tww.py @@ -4,14 +4,19 @@ Find time now, in the past or future in any timezone or location. """ import argparse +import datetime import logging import os from collections import defaultdict import dateparser -from datetime import datetime +from datetime import datetime, timedelta import pytz +from dateparser import parse as parse_dt +from dateutil.parser import parse as dutil_parse +from dateparser.timezone_parser import StaticTzInfo +from dateutil.tz import gettz, tzlocal from pytz import timezone from pytz.exceptions import UnknownTimeZoneError @@ -249,10 +254,16 @@ def resolve_timezone(query): normal_query = query.lower().strip() found_from_iana_tz = NORMALIZED_TZ_DICT.get(normal_query, "") found_from_abbr_tzs = list(NORMALIZED_TZ_ABBR.get(normal_query, set())) + found_from_offset_tz = None + try: + found_from_offset_tz = tzinfo_from_offset(normal_query) + except: ... normal_tz = found_from_iana_tz if not normal_tz: if found_from_abbr_tzs: normal_tz = list(found_from_abbr_tzs)[0] + elif found_from_offset_tz: + normal_tz = found_from_offset_tz.zone tz_abbrs = list(TZ_ABBRS_REVERSE.get(normal_tz, set())) logger.debug("Normalized timezone: {} -> {}".format(query, normal_tz)) local_location, remote_location = {}, {} @@ -293,6 +304,7 @@ def resolve_timezone(query): "normal_query": normal_query, "found_from_iana_tz": found_from_iana_tz, "found_from_abbr_tzs": found_from_abbr_tzs, + "found_from_offset_tzs": found_from_offset_tz, "local_location": local_location, "remote_location": remote_location, "search_pytz": normal_tz, @@ -348,3 +360,187 @@ if __name__ == "__main__": args = parse_args() setup_logging_level(args.debug) main(args) + + +def time_ago(date=None, diff=None): + """ + Get a datetime object, timedelta object or a int() Epoch timestamp and + return a + pretty string like 'an hour ago', 'Yesterday', '3 months ago', + 'just now', etc + Modified from: http://stackoverflow.com/a/1551394/141084 + """ + now = get_utcnow() + if not date: + if diff: + diff = timedelta(seconds=diff) + else: + diff = now - now + else: + if type(date) is str: + parsed_dt = parse_dt(date) + if parsed_dt.tzinfo is not None: + now = get_utcnow(tzaware=True) + diff = now - parsed_dt + elif type(date) is timedelta: + diff = date + elif type(date) is int: + diff = now - datetime.fromtimestamp(date) + elif isinstance(date, datetime): + if date.tzinfo is not None: + now = get_utcnow(tzaware=True) + diff = now - date + else: + raise ValueError('invalid date %s of type %s' % (date, type(date))) + + sign = '' + if diff.days < 0: + diff = -diff + sign = '-' + second_diff = diff.seconds + day_diff = diff.days + + if day_diff == 0: + if second_diff < 60: + return str("{}{:02d}".format(sign, second_diff)) + seconds = second_diff % 60 + if second_diff < 3600: + min_diff = second_diff // 60 + return str("{}{:02d}:{:02d}".format(sign, min_diff, seconds)) + minutes = second_diff % 3600 // 60 + if second_diff < 86400: + hrs_diff = second_diff // 3600 + return str("{}{:02d}:{:02d}:{:02d}".format(sign, hrs_diff, minutes, + seconds)) + seconds = second_diff % 60 + minutes = second_diff % 3600 // 60 + hours = second_diff // 3600 + if day_diff < 365: + if day_diff < 30: + return str("{}{}d{:02d}:{:02d}:{:02d}".format(sign, day_diff, hours, + minutes, seconds)) + months, days = day_diff // 30, day_diff % 30 + return str( + "{}{}m{}d{:02d}:{:02d}:{:02d}".format(sign, months, days, hours, + minutes, seconds)) + years = day_diff // 365 + days = day_diff % 365 + months, days = days // 30, days % 30 + return str( + "{}{}y{}m{}d{:02d}:{:02d}:{:02d}".format(sign, years, months, days, + hours, minutes, seconds)) + + +def query_to_dt(query): + human_dt, human_tz_loc = parse_query(query) + return solve_query(human_dt, human_tz_loc) + + +def tzinfo_from_offset(offset: str) -> pytz.timezone: + if ':' in offset: + offset = ''.join(offset.split(':')) + tznames = TZ_OFFSETS.get(offset) + for tzname in tznames: + if tzname.startswith('Etc/GMT'): + return pytz.timezone(tzname) + return pytz.timezone(tznames[0]) + + +def dateparser_parse_dt(s: str): + # print("Dateparser query: {}".format(s)) + parsed = parse_dt(s) + # print("Dateparser parsed query: {}".format(parsed)) + if not parsed: + parsed = dutil_parse(s) + if not parsed: + return None + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=pytz.timezone("utc")) + return parsed + + +def get_utcnow(tzaware: bool = True): + if tzaware: + return datetime.utcnow().replace(tzinfo=pytz.UTC) + return datetime.utcnow() + + +def get_local_now(tzaware: bool = True): + if tzaware: + return datetime.now().replace(tzinfo=tzinfo_from_offset(get_local_tz_offset())) + return datetime.utcnow() + + +def dt_tz_translation(dt: datetime, to_tz_offset: str, from_tz_offset: str = "+00:00") -> datetime: + if ':' in to_tz_offset: + to_shh, to_mm = to_tz_offset.split(':') + else: + to_shh, to_mm = to_tz_offset[:-2], to_tz_offset[-2:] + if ':' in from_tz_offset: + from_shh, from_mm = from_tz_offset.split(':') + else: + from_shh, from_mm = from_tz_offset[:-2], to_tz_offset[-2:] + tzinfo = tzinfo_from_offset(to_tz_offset) + if dt.tzinfo: + return dt.astimezone(tzinfo) + r_dt = dt + timedelta(hours=int(to_shh), minutes=int(to_mm)) - timedelta( + hours=int(from_shh), minutes=int(from_mm)) + r_dt = tzinfo.localize(r_dt) + return r_dt + + +def format_offset_from_timedelta(tdelta: timedelta): + sign = "+" if tdelta.seconds >= 0 else "-" + h = tdelta.seconds // 3600 + m = (tdelta.seconds // 60) - h * 60 + return "{}{:02d}:{:02d}".format(sign, h, m) + + +def get_local_tzname_iana(): + return '/'.join(os.path.realpath(gettz()._filename).split('/')[-2:]) + + +def get_local_tz_offset(): + return format_offset_from_timedelta( + datetime.now(tzlocal()).tzinfo._std_offset) + + +def tzname_to_tz_offset(tzname_iana: str): + return format_offset_from_timedelta( + pytz.timezone(tzname_iana).utcoffset(get_utcnow(False))) + + +def get_dt_tz_offset(dt: datetime) -> timedelta: + if dt.tzinfo is not None: + if type(dt.tzinfo) is StaticTzInfo: + tzoffset = dt.tzinfo._StaticTzInfo__offset + else: + tzoffset = dt.tzinfo._utcoffset + return tzoffset + return timedelta(0) + + +def get_us_since_epoch(dt: datetime): + utc_seconds = int(dt.timestamp() * 1e6) + if dt.tzinfo is None: + return utc_seconds + local_seconds = get_dt_tz_offset(dt).seconds + return utc_seconds + local_seconds + + +def get_ms_since_epoch(dt): + return int(get_us_since_epoch(dt) / 1e3) + + +def get_s_since_epoch(dt): + return int(get_us_since_epoch(dt) / 1e6) + + +def epoch_to_dt(seconds): + return datetime.fromtimestamp(seconds) + + +def time_to_emoji(dt): + seconds = get_s_since_epoch(dt) + a = int((seconds / 900 - 3) / 2 % 24) + return chr(128336 + a // 2 + a % 2 * 12) \ No newline at end of file