tokenizer next

This commit is contained in:
Daniel Tsvetkov 2019-12-13 15:16:42 +01:00
parent 3109540c08
commit f13d40f2ee
3 changed files with 292 additions and 233 deletions

View File

@ -1,193 +1,6 @@
import datetime
import os
from datetime import datetime, timedelta
import pytz
from dateparser import parse as parse_dt
from dateparser.timezone_parser import StaticTzInfo
from dateutil.tz import tzlocal, gettz
from tww import parse_query, solve_query, TZ_OFFSETS
def time_ago(date=None, diff=None):
    """
    Format the time elapsed since `date` (or an explicit `diff`) as a compact
    duration string.

    `date` may be a date string (parsed with dateparser), a timedelta (used
    as the difference itself), an int Unix timestamp, or a datetime. When no
    date is given, `diff` is taken as a number of seconds; with neither, the
    difference is zero. Naive datetimes are compared against naive UTC now,
    aware ones against aware UTC now.

    The result is zero padded and grows with the magnitude: 'SS', 'MM:SS',
    'HH:MM:SS', 'NdHH:MM:SS', 'NmNdHH:MM:SS' (30-day months) and
    'NyNmNdHH:MM:SS' (365-day years). A leading '-' marks a negative
    difference (a date in the future).

    Raises ValueError for an unparseable string or an unsupported type.
    Modified from: http://stackoverflow.com/a/1551394/141084
    """
    # Epoch 0 is a legitimate timestamp, so an int never counts as "no date".
    if not date and type(date) is not int:
        elapsed = timedelta(seconds=diff) if diff else timedelta(0)
    elif isinstance(date, str):
        parsed_dt = parse_dt(date)
        if parsed_dt is None:
            raise ValueError('invalid date %s of type %s' % (date, type(date)))
        # Aware and naive datetimes cannot be subtracted from each other, so
        # "now" is requested in the same flavour as the parsed value.
        elapsed = get_utcnow(tzaware=parsed_dt.tzinfo is not None) - parsed_dt
    elif isinstance(date, timedelta):
        elapsed = date
    elif type(date) is int:
        # Interpret the timestamp as an absolute instant in UTC instead of a
        # naive local time that cannot be compared with UTC now.
        elapsed = get_utcnow(tzaware=True) - datetime.fromtimestamp(
            date, tz=pytz.UTC)
    elif isinstance(date, datetime):
        elapsed = get_utcnow(tzaware=date.tzinfo is not None) - date
    else:
        raise ValueError('invalid date %s of type %s' % (date, type(date)))

    sign = ''
    if elapsed.days < 0:
        elapsed = -elapsed
        sign = '-'

    day_diff = elapsed.days
    second_diff = elapsed.seconds
    hours = second_diff // 3600
    minutes = second_diff % 3600 // 60
    seconds = second_diff % 60

    if day_diff == 0:
        if second_diff < 60:
            return "{}{:02d}".format(sign, second_diff)
        if second_diff < 3600:
            return "{}{:02d}:{:02d}".format(sign, minutes, seconds)
        return "{}{:02d}:{:02d}:{:02d}".format(sign, hours, minutes, seconds)

    if day_diff < 30:
        return "{}{}d{:02d}:{:02d}:{:02d}".format(
            sign, day_diff, hours, minutes, seconds)

    if day_diff < 365:
        months, days = divmod(day_diff, 30)
        return "{}{}m{}d{:02d}:{:02d}:{:02d}".format(
            sign, months, days, hours, minutes, seconds)

    years, days = divmod(day_diff, 365)
    months, days = divmod(days, 30)
    return "{}{}y{}m{}d{:02d}:{:02d}:{:02d}".format(
        sign, years, months, days, hours, minutes, seconds)
def query_to_dt(query):
    """Parse a free-form query and solve it.

    The query is split by `parse_query` into a date/time part and a
    timezone/location part, which are then handed to `solve_query`.
    """
    when_part, where_part = parse_query(query)
    return solve_query(when_part, where_part)
# ================== # ==================
def tzinfo_from_offset(offset: str) -> pytz.timezone:
    """Return a pytz timezone for an offset string such as '+03:00' or '+0300'.

    Colons are stripped before the offset is looked up in TZ_OFFSETS. An
    offset with no entry falls back to UTC. Among the candidate zone names
    the first one starting with 'Etc/GMT' wins; otherwise the first
    candidate is used.
    """
    lookup_key = offset.replace(':', '')
    candidates = TZ_OFFSETS.get(lookup_key)
    if not candidates:
        return pytz.timezone("utc")
    chosen = next(
        (name for name in candidates if name.startswith('Etc/GMT')),
        candidates[0],
    )
    return pytz.timezone(chosen)
def dateparser_parse_dt(s: str):
    """Parse `s` with dateparser and return a timezone-aware datetime.

    Returns None when dateparser yields nothing. A naive result is tagged
    as UTC; a result that already carries tzinfo is returned unchanged.
    """
    result = parse_dt(s)
    if not result:
        return None
    if result.tzinfo is not None:
        return result
    return result.replace(tzinfo=pytz.timezone("utc"))
def get_utcnow(tzaware: bool = True):
    """Return the current UTC time.

    With `tzaware` true (the default) the result carries pytz.UTC as its
    tzinfo; otherwise a naive datetime is returned.
    """
    utc_now = datetime.utcnow()
    return utc_now.replace(tzinfo=pytz.UTC) if tzaware else utc_now
def dt_tz_translation(dt: datetime, to_tz_offset: str,
                      from_tz_offset: str = "+00:00") -> datetime:
    """Translate `dt` into the timezone described by `to_tz_offset`.

    Offsets are strings of the form '+HH:MM' or '+HHMM'. An aware `dt` is
    converted with `astimezone`. A naive `dt` is assumed to be expressed in
    `from_tz_offset`; it is shifted by the difference between the two
    offsets and then localized to the target zone.

    Raises ValueError when an offset needed for the shift cannot be parsed.
    """

    def _as_timedelta(tz_offset: str) -> timedelta:
        # Split the signed hours from the minutes, then apply the sign to
        # both, so that '-05:30' means minus five and a half hours.
        if ':' in tz_offset:
            signed_hours, mins = tz_offset.split(':')
        else:
            signed_hours, mins = tz_offset[:-2], tz_offset[-2:]
        hours, minutes = int(signed_hours), int(mins)
        if signed_hours.strip().startswith('-'):
            minutes = -minutes
        return timedelta(hours=hours, minutes=minutes)

    tzinfo = tzinfo_from_offset(to_tz_offset)
    if dt.tzinfo:
        return dt.astimezone(tzinfo)
    shifted = dt + _as_timedelta(to_tz_offset) - _as_timedelta(from_tz_offset)
    return tzinfo.localize(shifted)
def format_offset_from_timedelta(tdelta: timedelta):
    """Format a UTC offset given as a timedelta into a '+HH:MM' / '-HH:MM' string.

    The sign is taken from the total duration: `timedelta.seconds` alone is
    never negative, because negative durations are stored as negative days
    plus positive seconds.
    """
    total_seconds = int(tdelta.total_seconds())
    sign = "-" if total_seconds < 0 else "+"
    hours, remainder = divmod(abs(total_seconds), 3600)
    minutes = remainder // 60
    return "{}{:02d}:{:02d}".format(sign, hours, minutes)
def get_local_tzname_iana():
    """Return the last two path components of the local timezone file.

    The file path comes from dateutil's `gettz()` and is resolved through
    symlinks first. Presumably this yields an 'Area/City' IANA name, which
    depends on how the system lays out its zoneinfo files.
    """
    zone_file = os.path.realpath(gettz()._filename)
    trailing_parts = zone_file.split('/')[-2:]
    return '/'.join(trailing_parts)
def get_local_tz_offset():
    """Return the local timezone's standard offset as a formatted string.

    Reads the `_std_offset` attribute of dateutil's local tzinfo and formats
    it with `format_offset_from_timedelta`.
    """
    local_tz = datetime.now(tzlocal()).tzinfo
    return format_offset_from_timedelta(local_tz._std_offset)
def tzname_to_tz_offset(tzname_iana: str):
    """Return the current UTC offset of the named zone as a formatted string.

    The offset is evaluated by pytz for the present naive UTC time.
    """
    zone = pytz.timezone(tzname_iana)
    naive_utc_now = get_utcnow(False)
    return format_offset_from_timedelta(zone.utcoffset(naive_utc_now))
def get_dt_tz_offset(dt: datetime) -> timedelta:
    """Return the UTC offset carried by `dt`, or a zero timedelta if it has none.

    Uses the public `datetime.utcoffset()` API, so any tzinfo implementation
    works rather than only those exposing particular private attributes.
    """
    if dt.tzinfo is None:
        return timedelta(0)
    offset = dt.utcoffset()
    # A tzinfo is allowed to report "unknown" as None; treat that as zero.
    return offset if offset is not None else timedelta(0)
def get_us_since_epoch(dt: datetime):
    """Return microseconds since the epoch for `dt`, shifted by its UTC offset.

    For a naive `dt` the plain timestamp (in microseconds) is returned. For
    an aware `dt` the UTC offset is added, so the value reflects the wall
    clock time shown in that timezone. The offset is converted to
    microseconds before being added, and its sign is honoured.
    """
    utc_us = int(dt.timestamp() * 1e6)
    if dt.tzinfo is None:
        return utc_us
    offset = dt.utcoffset() or timedelta(0)
    return utc_us + int(offset.total_seconds() * 1e6)
def get_ms_since_epoch(dt):
    """Return `get_us_since_epoch(dt)` scaled down to whole milliseconds."""
    microseconds = get_us_since_epoch(dt)
    return int(microseconds / 1e3)
def get_s_since_epoch(dt):
    """Return `get_us_since_epoch(dt)` scaled down to whole seconds."""
    microseconds = get_us_since_epoch(dt)
    return int(microseconds / 1e6)
def epoch_to_dt(seconds):
    """Convert a Unix timestamp in seconds to a naive local datetime."""
    local_dt = datetime.fromtimestamp(seconds)
    return local_dt
def time_to_emoji(dt):
    """Return a single clock-face character for the time of `dt`.

    The seconds value from `get_s_since_epoch` is reduced to an index in
    0..23, which selects a code point starting at 128336 (U+1F550):
    index // 2 is added directly, and odd indexes add a further 12.
    """
    quarter_hours = get_s_since_epoch(dt) / 900
    slot = int((quarter_hours - 3) / 2 % 24)
    return chr(128336 + slot // 2 + slot % 2 * 12)
if __name__ == "__main__":
    # Manual check: show the name detected for the local timezone.
    local_zone_name = get_local_tzname_iana()
    print(local_zone_name)

View File

@ -1,42 +1,21 @@
import re import re
from datetime import timedelta import sys
from datetime import timedelta, datetime
from src.tww.time_lib import dateparser_parse_dt, get_utcnow, \ from tww import resolve_timezone, dateparser_parse_dt, get_utcnow, get_s_since_epoch, get_ms_since_epoch, \
get_s_since_epoch, \ dt_tz_translation, DEFAULT_FORMAT, get_local_now
get_ms_since_epoch
# Query patterns, all case-insensitive. Raw strings keep `\s` as a regex
# escape rather than an (invalid) Python string escape.

# Catch-all: the whole query is a single date/time expression.
r_generic = re.compile(r'(.*)', flags=re.IGNORECASE)

# Epoch in seconds.
r_time_in_epoch_s_now = re.compile(r'(?:time since epoch|seconds since epoch)', flags=re.IGNORECASE)
r_time_in_epoch_s2 = re.compile(r'(.*)?\s*(?:in|to)\s*(?:epoch|seconds since epoch|seconds)', flags=re.IGNORECASE)
r_time_in_epoch_s3 = re.compile(r'(?:seconds)?\s*since\s*(.*)', flags=re.IGNORECASE)

# Epoch in milliseconds ('miliseconds' is an accepted misspelling).
r_time_in_epoch_ms_now = re.compile(r'(?:milliseconds since epoch)', flags=re.IGNORECASE)
r_time_in_epoch_ms2 = re.compile(r'(.*)?\s*(?:in|to)\s*(?:ms|milliseconds|miliseconds)', flags=re.IGNORECASE)
r_time_in_epoch_ms3 = re.compile(r'(?:ms|milliseconds|miliseconds)?\s*since\s*(.*)', flags=re.IGNORECASE)

# Durations relative to now, or between two dates.
r_time_since = re.compile(r'(?:time)?\s*since\s*(.*)', flags=re.IGNORECASE)
r_time_until = re.compile(r'(?:time)?\s*until\s*(.*)', flags=re.IGNORECASE)
r_time_between = re.compile(r'(?:time)?\s*between\s*(.*)\s*and\s*(.*)', flags=re.IGNORECASE)

# "<date/time> in|to <timezone or place>".
r_timezone_translation = re.compile(r'(.*)?\s(?:in|to)\s(.*)', flags=re.IGNORECASE)
# Sample queries for manual runs, including inputs that should match no
# pattern (None, empty and blank strings, a misspelled keyword).
test_strings = [
    None,
    "",
    "s",
    " ",
    "Time since 2019-05-12",
    "Since yesterday",
    "time between yesterday and tomorrow",
    "time until 25 december",
    "time sinc",
    "now in milliseconds",
    "seconds since epoch",
    "1992-01-27 to epoch",
    "milliseconds since 1992-01-27",
]
def handler_time_s(dt_s: str) -> int: def handler_time_s(dt_s: str) -> int:
@ -47,6 +26,22 @@ def handler_time_ms(dt_s: str) -> int:
return get_ms_since_epoch(dateparser_parse_dt(dt_s)) return get_ms_since_epoch(dateparser_parse_dt(dt_s))
def handler_time_s_now_local() -> int:
    """Return seconds since the epoch computed from the local current time."""
    local_now = get_local_now()
    return get_s_since_epoch(local_now)
def handler_time_ms_now_local() -> int:
    """Return milliseconds since the epoch computed from the local current time."""
    local_now = get_local_now()
    return get_ms_since_epoch(local_now)
def handler_time_s_now_utc() -> int:
    """Return seconds since the epoch computed from the current UTC time."""
    utc_now = get_utcnow()
    return get_s_since_epoch(utc_now)
def handler_time_ms_now_utc() -> int:
    """Return milliseconds since the epoch computed from the current UTC time."""
    utc_now = get_utcnow()
    return get_ms_since_epoch(utc_now)
def handler_time_diff(start_dt: str, end_dt: str) -> timedelta:
    """Return the duration from `start_dt` to `end_dt`, both given as date strings."""
    end = dateparser_parse_dt(end_dt)
    start = dateparser_parse_dt(start_dt)
    return end - start
@ -59,14 +54,32 @@ def handler_time_until(end_dt_s: str) -> timedelta:
return handler_time_diff(str(get_utcnow()), end_dt_s) return handler_time_diff(str(get_utcnow()), end_dt_s)
def handler_timezone_translation(dt_s: str, timezone_like_s: str) -> datetime:
    """Parse `dt_s` and translate it into the timezone named by `timezone_like_s`.

    The timezone text is resolved with `resolve_timezone`; its 'tz_offset'
    entry is the target offset. Returns None when resolution yields nothing.
    """
    dt = dateparser_parse_dt(dt_s)
    resolved = resolve_timezone(timezone_like_s)
    if not resolved:
        return None
    return dt_tz_translation(dt, resolved.get('tz_offset'))
def handler_generic_parser(dt_s: str) -> datetime:
    """Fallback handler: parse the whole query as a date/time expression."""
    parsed = dateparser_parse_dt(dt_s)
    return parsed
# (pattern, handler) pairs; `parse` tries them in this order. Each handler
# receives the groups captured by its pattern as positional arguments.
regex_handlers = [
    # epoch in seconds
    (r_time_in_epoch_s_now, handler_time_s_now_local),
    (r_time_in_epoch_s_now, handler_time_s_now_utc),
    (r_time_in_epoch_s2, handler_time_s),
    (r_time_in_epoch_s3, handler_time_s),
    # epoch in milliseconds
    (r_time_in_epoch_ms_now, handler_time_ms_now_local),
    (r_time_in_epoch_ms_now, handler_time_ms_now_utc),
    (r_time_in_epoch_ms2, handler_time_ms),
    (r_time_in_epoch_ms3, handler_time_ms),
    # timezone translation
    (r_timezone_translation, handler_timezone_translation),
    # durations
    (r_time_since, handler_time_since),
    (r_time_until, handler_time_until),
    (r_time_between, handler_time_diff),
    # catch-all
    (r_generic, handler_generic_parser),
]
@ -81,15 +94,52 @@ def try_regex(r, s):
def parse(s):
    """Run every matching handler on `s` and collect the outcomes.

    Returns a list of (handler name, result) tuples, one per pattern in
    `regex_handlers` that matched. A handler that raises contributes a
    result of None rather than aborting the run.
    """
    solutions = []
    for pattern, handler in regex_handlers:
        groups = try_regex(pattern, s)
        if groups is None:
            continue
        try:
            outcome = handler(*groups)
        except Exception:
            # Best effort: a failing handler is reported as unsolved.
            outcome = None
        solutions.append((handler.__name__, outcome))
    return solutions
def test():
    """Print the `parse` result for a fixed list of sample queries.

    The samples include inputs that should match nothing (None, empty and
    blank strings, a misspelled keyword) next to duration, epoch and
    timezone translation queries.
    """
    samples = [
        None,
        "",
        "s",
        " ",
        "Time since 2019-05-12",
        "Since yesterday",
        "time between yesterday and tomorrow",
        "time until 25 december",
        "time sinc",
        "now in milliseconds",
        "seconds since epoch",
        "1992-01-27 to epoch",
        "milliseconds since 1992-01-27",
        "now in sofia",
        "now in PST",
        "2 hours ago to Sydney",
        "now in +03:00",
        "now in dublin",
    ]
    for sample in samples:
        outcome = parse(sample)
        print("{} -> {}".format(sample, outcome))
if __name__ == "__main__":
    # Command-line entry point: all arguments are joined into one query, and
    # the outcome of every matching handler is printed on its own line.
    query = ' '.join(sys.argv[1:])
    results = parse(query)
    for handler, result in results:
        if result is None:
            # `parse` reports a failed handler as None. Comparing
            # `type(result)` with None can never succeed.
            print(" {} -> Couldn't solve query".format(handler))
        elif type(result) is datetime:
            print(" {} -> {}".format(handler, result.strftime(DEFAULT_FORMAT)))
        else:
            # timedelta and plain integer results print as they are.
            print(" {} -> {}".format(handler, result))

View File

@ -4,14 +4,19 @@ Find time now, in the past or future in any timezone or location.
""" """
import argparse import argparse
import datetime
import logging import logging
import os import os
from collections import defaultdict from collections import defaultdict
import dateparser import dateparser
from datetime import datetime from datetime import datetime, timedelta
import pytz import pytz
from dateparser import parse as parse_dt
from dateutil.parser import parse as dutil_parse
from dateparser.timezone_parser import StaticTzInfo
from dateutil.tz import gettz, tzlocal
from pytz import timezone from pytz import timezone
from pytz.exceptions import UnknownTimeZoneError from pytz.exceptions import UnknownTimeZoneError
@ -249,10 +254,16 @@ def resolve_timezone(query):
normal_query = query.lower().strip() normal_query = query.lower().strip()
found_from_iana_tz = NORMALIZED_TZ_DICT.get(normal_query, "") found_from_iana_tz = NORMALIZED_TZ_DICT.get(normal_query, "")
found_from_abbr_tzs = list(NORMALIZED_TZ_ABBR.get(normal_query, set())) found_from_abbr_tzs = list(NORMALIZED_TZ_ABBR.get(normal_query, set()))
found_from_offset_tz = None
try:
found_from_offset_tz = tzinfo_from_offset(normal_query)
except: ...
normal_tz = found_from_iana_tz normal_tz = found_from_iana_tz
if not normal_tz: if not normal_tz:
if found_from_abbr_tzs: if found_from_abbr_tzs:
normal_tz = list(found_from_abbr_tzs)[0] normal_tz = list(found_from_abbr_tzs)[0]
elif found_from_offset_tz:
normal_tz = found_from_offset_tz.zone
tz_abbrs = list(TZ_ABBRS_REVERSE.get(normal_tz, set())) tz_abbrs = list(TZ_ABBRS_REVERSE.get(normal_tz, set()))
logger.debug("Normalized timezone: {} -> {}".format(query, normal_tz)) logger.debug("Normalized timezone: {} -> {}".format(query, normal_tz))
local_location, remote_location = {}, {} local_location, remote_location = {}, {}
@ -293,6 +304,7 @@ def resolve_timezone(query):
"normal_query": normal_query, "normal_query": normal_query,
"found_from_iana_tz": found_from_iana_tz, "found_from_iana_tz": found_from_iana_tz,
"found_from_abbr_tzs": found_from_abbr_tzs, "found_from_abbr_tzs": found_from_abbr_tzs,
"found_from_offset_tzs": found_from_offset_tz,
"local_location": local_location, "local_location": local_location,
"remote_location": remote_location, "remote_location": remote_location,
"search_pytz": normal_tz, "search_pytz": normal_tz,
@ -348,3 +360,187 @@ if __name__ == "__main__":
args = parse_args() args = parse_args()
setup_logging_level(args.debug) setup_logging_level(args.debug)
main(args) main(args)
def time_ago(date=None, diff=None):
    """
    Format the time elapsed since `date` (or an explicit `diff`) as a compact
    duration string.

    `date` may be a date string (parsed with dateparser), a timedelta (used
    as the difference itself), an int Unix timestamp, or a datetime. When no
    date is given, `diff` is taken as a number of seconds; with neither, the
    difference is zero. Naive datetimes are compared against naive UTC now,
    aware ones against aware UTC now.

    The result is zero padded and grows with the magnitude: 'SS', 'MM:SS',
    'HH:MM:SS', 'NdHH:MM:SS', 'NmNdHH:MM:SS' (30-day months) and
    'NyNmNdHH:MM:SS' (365-day years). A leading '-' marks a negative
    difference (a date in the future).

    Raises ValueError for an unparseable string or an unsupported type.
    Modified from: http://stackoverflow.com/a/1551394/141084
    """
    # Epoch 0 is a legitimate timestamp, so an int never counts as "no date".
    if not date and type(date) is not int:
        elapsed = timedelta(seconds=diff) if diff else timedelta(0)
    elif isinstance(date, str):
        parsed_dt = parse_dt(date)
        if parsed_dt is None:
            raise ValueError('invalid date %s of type %s' % (date, type(date)))
        # Aware and naive datetimes cannot be subtracted from each other, so
        # "now" is requested in the same flavour as the parsed value.
        elapsed = get_utcnow(tzaware=parsed_dt.tzinfo is not None) - parsed_dt
    elif isinstance(date, timedelta):
        elapsed = date
    elif type(date) is int:
        # Interpret the timestamp as an absolute instant in UTC instead of a
        # naive local time that cannot be compared with UTC now.
        elapsed = get_utcnow(tzaware=True) - datetime.fromtimestamp(
            date, tz=pytz.UTC)
    elif isinstance(date, datetime):
        elapsed = get_utcnow(tzaware=date.tzinfo is not None) - date
    else:
        raise ValueError('invalid date %s of type %s' % (date, type(date)))

    sign = ''
    if elapsed.days < 0:
        elapsed = -elapsed
        sign = '-'

    day_diff = elapsed.days
    second_diff = elapsed.seconds
    hours = second_diff // 3600
    minutes = second_diff % 3600 // 60
    seconds = second_diff % 60

    if day_diff == 0:
        if second_diff < 60:
            return "{}{:02d}".format(sign, second_diff)
        if second_diff < 3600:
            return "{}{:02d}:{:02d}".format(sign, minutes, seconds)
        return "{}{:02d}:{:02d}:{:02d}".format(sign, hours, minutes, seconds)

    if day_diff < 30:
        return "{}{}d{:02d}:{:02d}:{:02d}".format(
            sign, day_diff, hours, minutes, seconds)

    if day_diff < 365:
        months, days = divmod(day_diff, 30)
        return "{}{}m{}d{:02d}:{:02d}:{:02d}".format(
            sign, months, days, hours, minutes, seconds)

    years, days = divmod(day_diff, 365)
    months, days = divmod(days, 30)
    return "{}{}y{}m{}d{:02d}:{:02d}:{:02d}".format(
        sign, years, months, days, hours, minutes, seconds)
def query_to_dt(query):
    """Parse a free-form query and solve it.

    The query is split by `parse_query` into a date/time part and a
    timezone/location part, which are then handed to `solve_query`.
    """
    when_part, where_part = parse_query(query)
    return solve_query(when_part, where_part)
def tzinfo_from_offset(offset: str) -> pytz.timezone:
    """Return a pytz timezone for an offset string such as '+03:00' or '+0300'.

    Colons are stripped before the offset is looked up in TZ_OFFSETS. Among
    the candidate zone names the first one starting with 'Etc/GMT' wins;
    otherwise the first candidate is used.

    An offset with no entry in TZ_OFFSETS raises TypeError, because the
    missing lookup result cannot be iterated. `resolve_timezone` calls this
    inside a try/except to probe whether a query is an offset.
    """
    lookup_key = offset.replace(':', '')
    candidates = TZ_OFFSETS.get(lookup_key)
    chosen = next(
        (name for name in candidates if name.startswith('Etc/GMT')),
        candidates[0],
    )
    return pytz.timezone(chosen)
def dateparser_parse_dt(s: str):
    """Parse `s` into a timezone-aware datetime.

    dateparser is tried first; when it yields nothing, dateutil's parser is
    tried on the same string. Returns None if neither produces a value. A
    naive result is tagged as UTC; a result that already carries tzinfo is
    returned unchanged.
    """
    result = parse_dt(s) or dutil_parse(s)
    if not result:
        return None
    if result.tzinfo is not None:
        return result
    return result.replace(tzinfo=pytz.timezone("utc"))
def get_utcnow(tzaware: bool = True):
    """Return the current UTC time.

    With `tzaware` true (the default) the result carries pytz.UTC as its
    tzinfo; otherwise a naive datetime is returned.
    """
    utc_now = datetime.utcnow()
    return utc_now.replace(tzinfo=pytz.UTC) if tzaware else utc_now
def get_local_now(tzaware: bool = True):
    """Return the current local time.

    With `tzaware` true (the default) the local wall clock time is localized
    to the pytz zone matching the local offset. Otherwise the naive local
    time is returned; the naive branch returns local time, not UTC.
    """
    local_now = datetime.now()
    if not tzaware:
        return local_now
    local_tz = tzinfo_from_offset(get_local_tz_offset())
    # pytz zones must be attached with localize(); replace(tzinfo=...) can
    # pick a zone's historical local-mean-time offset.
    return local_tz.localize(local_now)
def dt_tz_translation(dt: datetime, to_tz_offset: str, from_tz_offset: str = "+00:00") -> datetime:
    """Translate `dt` into the timezone described by `to_tz_offset`.

    Offsets are strings of the form '+HH:MM' or '+HHMM'. An aware `dt` is
    converted with `astimezone`. A naive `dt` is assumed to be expressed in
    `from_tz_offset`; it is shifted by the difference between the two
    offsets and then localized to the target zone.

    Raises ValueError when an offset needed for the shift cannot be parsed.
    """

    def _as_timedelta(tz_offset: str) -> timedelta:
        # Split the signed hours from the minutes, then apply the sign to
        # both, so that '-05:30' means minus five and a half hours.
        if ':' in tz_offset:
            signed_hours, mins = tz_offset.split(':')
        else:
            signed_hours, mins = tz_offset[:-2], tz_offset[-2:]
        hours, minutes = int(signed_hours), int(mins)
        if signed_hours.strip().startswith('-'):
            minutes = -minutes
        return timedelta(hours=hours, minutes=minutes)

    tzinfo = tzinfo_from_offset(to_tz_offset)
    if dt.tzinfo:
        return dt.astimezone(tzinfo)
    shifted = dt + _as_timedelta(to_tz_offset) - _as_timedelta(from_tz_offset)
    return tzinfo.localize(shifted)
def format_offset_from_timedelta(tdelta: timedelta):
    """Format a UTC offset given as a timedelta into a '+HH:MM' / '-HH:MM' string.

    The sign is taken from the total duration: `timedelta.seconds` alone is
    never negative, because negative durations are stored as negative days
    plus positive seconds.
    """
    total_seconds = int(tdelta.total_seconds())
    sign = "-" if total_seconds < 0 else "+"
    hours, remainder = divmod(abs(total_seconds), 3600)
    minutes = remainder // 60
    return "{}{:02d}:{:02d}".format(sign, hours, minutes)
def get_local_tzname_iana():
    """Return the last two path components of the local timezone file.

    The file path comes from dateutil's `gettz()` and is resolved through
    symlinks first. Presumably this yields an 'Area/City' IANA name, which
    depends on how the system lays out its zoneinfo files.
    """
    zone_file = os.path.realpath(gettz()._filename)
    trailing_parts = zone_file.split('/')[-2:]
    return '/'.join(trailing_parts)
def get_local_tz_offset():
    """Return the local timezone's standard offset as a formatted string.

    Reads the `_std_offset` attribute of dateutil's local tzinfo and formats
    it with `format_offset_from_timedelta`.
    """
    local_tz = datetime.now(tzlocal()).tzinfo
    return format_offset_from_timedelta(local_tz._std_offset)
def tzname_to_tz_offset(tzname_iana: str):
    """Return the current UTC offset of the named zone as a formatted string.

    The offset is evaluated by pytz for the present naive UTC time.
    """
    zone = pytz.timezone(tzname_iana)
    naive_utc_now = get_utcnow(False)
    return format_offset_from_timedelta(zone.utcoffset(naive_utc_now))
def get_dt_tz_offset(dt: datetime) -> timedelta:
    """Return the UTC offset carried by `dt`, or a zero timedelta if it has none.

    Uses the public `datetime.utcoffset()` API, so any tzinfo implementation
    works rather than only those exposing particular private attributes.
    """
    if dt.tzinfo is None:
        return timedelta(0)
    offset = dt.utcoffset()
    # A tzinfo is allowed to report "unknown" as None; treat that as zero.
    return offset if offset is not None else timedelta(0)
def get_us_since_epoch(dt: datetime):
    """Return microseconds since the epoch for `dt`, shifted by its UTC offset.

    For a naive `dt` the plain timestamp (in microseconds) is returned. For
    an aware `dt` the UTC offset is added, so the value reflects the wall
    clock time shown in that timezone. The offset is converted to
    microseconds before being added, and its sign is honoured.
    """
    utc_us = int(dt.timestamp() * 1e6)
    if dt.tzinfo is None:
        return utc_us
    offset = dt.utcoffset() or timedelta(0)
    return utc_us + int(offset.total_seconds() * 1e6)
def get_ms_since_epoch(dt):
    """Return `get_us_since_epoch(dt)` scaled down to whole milliseconds."""
    microseconds = get_us_since_epoch(dt)
    return int(microseconds / 1e3)
def get_s_since_epoch(dt):
    """Return `get_us_since_epoch(dt)` scaled down to whole seconds."""
    microseconds = get_us_since_epoch(dt)
    return int(microseconds / 1e6)
def epoch_to_dt(seconds):
    """Convert a Unix timestamp in seconds to a naive local datetime."""
    local_dt = datetime.fromtimestamp(seconds)
    return local_dt
def time_to_emoji(dt):
    """Return a single clock-face character for the time of `dt`.

    The seconds value from `get_s_since_epoch` is reduced to an index in
    0..23, which selects a code point starting at 128336 (U+1F550):
    index // 2 is added directly, and odd indexes add a further 12.
    """
    quarter_hours = get_s_since_epoch(dt) / 900
    slot = int((quarter_hours - 3) / 2 % 24)
    return chr(128336 + slot // 2 + slot % 2 * 12)