# HV: Contains parsers for querying list of scans
from __future__ import print_function
from six import iteritems
from functools import reduce
from functional import *
import re, hvutil, operator, math, itertools, inspect, plotiterator, copy, numpy, plotutil, collections

haveQuanta = False
try:
    import pyrap.quanta
    haveQuanta = True
except ImportError:
    pass

class token_type(object):
    __slots__ = ['type', 'value', 'position']
    # tokens must AT LEAST have a type. Most other things are optional
    def __init__(self, tp, val=None, **kwargs):
        self.type     = tp
        self.value    = val
        self.position = kwargs.get('position', -1)

    def __str__(self):
        return "(type={0}, val={1}, pos={2})".format(self.type, self.value, self.position)

    def __repr__(self):
        return str(self)

def D(x):
    print(x)
    return x

def DD(x):
    def do_it(y):
        print(x, y)
        return y
    return do_it

def mk_tokenizer(tokens, **env):
    # run all the token regexps against the text at position p
    # and return a list of those that matched with their converted value(s)
    def get_matchobjects(txt, p):
        return map_(lambda mo_tp: (mo_tp[0], mo_tp[1](mo_tp[0], position=p, **env)),
                    filter(GetN(0), map(lambda rx_tp: (rx_tp[0].match(txt, p), rx_tp[1]), tokens)))
    #sep = "\n -> "
    def do_tokenize(string):
        pos = 0
        while pos<len(string):
            # Run all known regexps against the current string and filter out which one(s) matched
            moList = get_matchobjects(string, pos)
            if len(moList)==0:
                raise RuntimeError("\n{0}\n{1:>{2}s}^\n{3} tokens matched here".format(string, "", pos, len(moList)))
            # extract match-object and the token from the result list
            (mo, tok) = moList[0]
            pos += (mo.end() - mo.start())
            # Ignore tokens that say they are nothing (e.g. whitespace)
            if tok is not None:
                yield tok
        yield token_type(None, None)
    return do_tokenize

# helper functions
def mk_number(val):
    try:
        return int(val)
    except ValueError:
        return float(val)

# transform a matched date/time object into MJD seconds
def mk_mjdsecs(mo, **kwargs):
    if not haveQuanta:
        raise RuntimeError("pyrap.quanta module not available for date/time to seconds functionality")
    # we must have 'year', 'month' and 'day'
    # 'h', 'm' and 's' are optional and default to 0
    # We know that CASA's quantity parses
    #     day-month-yearThhHmmMss.sssS
    # correctly, so transform our input to that form
    G = mo.group
    # Be careful to not access groups that may not be present at all!
    option = lambda group: G(group) if group in mo.groupdict() and mo.groupdict()[group] else 0
    return pyrap.quanta.quantity(
               "{0}-{1}-{2}T{3}H{4}M{5}S".format(G('day'), G('month'), G('year'), option('h'), option('m'), option('s'))
           ).get_value("s")

# Transform relative day time stamp and durations into seconds
def mk_seconds(mg, **kwargs):
    rv     = 0.0
    secday = 86400
    grps   = mg.groupdict()
    if 'd' in grps and mg.group('d'):
        rv += float(mg.group('d'))*secday
    if 'h' in grps and mg.group('h'):
        rv += float(mg.group('h'))*3600.0
    if 'm' in grps and mg.group('m'):
        rv += float(mg.group('m'))*60.0
    if 's' in grps and mg.group('s'):
        rv += float(mg.group('s'))
    if 'offset' in grps and mg.group('offset'):
        rv += (int(mg.group('offset')) + (math.floor(float(kwargs.get('start', 0))/secday))) * secday
    if 'neg' in grps and mg.group('neg'):
        rv *= -1
    return rv

resolver = lambda group: lambda mg, **env: env[mg.group(group)]

def mk_operator(which):
    ops = {'+':operator.add, '-':operator.sub, '*':operator.mul, '/':operator.truediv, '^':operator.pow,
           '&&':operator.and_, '||':operator.or_, '!':operator.not_,
           'and':operator.and_, 'or':operator.or_, 'not':operator.not_,
           'in':lambda x, y: operator.contains(y, x),
           '=':operator.eq, '<':operator.lt, '<=':operator.le, '>=':operator.ge, '>':operator.gt}
    return ops[which]

# Token makers
token_def  = lambda pattern, fn: (re.compile(pattern), fn)
ignore_t   = lambda       : lambda o, **k: None
keyword_t  = lambda       : lambda o, **k: token_type(o.group(0), **k)
simple_t   = lambda tp    : lambda o, **k: token_type(tp, **k)
value_t    = lambda tp    : lambda o, **k: token_type(tp, o.group(0), **k)
# extract a given match group
extract_t  = lambda tp, g : lambda o, **k: token_type(tp, o.group(g), **k)
# xform = transform match group 0 [ie the whole regex match]
xform_t    = lambda tp, f : lambda o, **k: token_type(tp, f(o.group(0)), **k)
# xformmg  = transform the match object; ie you have access to all match groups
# xformmge = same as previous but you have the environment passed in as well
xformmg_t  = lambda tp, f : lambda o, **k: token_type(tp, f(o), **k)
xformmge_t = lambda tp, f : lambda o, **k: token_type(tp, f(o, **k), **k)
number_t   = lambda       : xform_t('number', mk_number)
operator_t = lambda tp    : xform_t(tp, mk_operator)
datetime_t = lambda f     : xformmge_t('datetime', f)
resolve_t  = lambda tp, g : xformmge_t(tp, resolver(g))
#resolve_t = lambda tp, g : lambda o, **k: token_type(tp, resolver(g)(o, **k), **k)

# helper functions to help build token regexes
MAYBE = lambda x: r"("+x+r")?"
NAMED = lambda n, x: r"(?P<"+n+r">"+x+r")"

# Patterns for supported datetime formats
YMD     = r"(?P<year>\d{4})/(?P<month>\d{1,2})/(?P<day>\d{1,2})"
DMY     = r"(?P<day>\d{1,2})-(?P<month>([a-zA-Z]{3}|\d{1,2}))-(?P<year>\d{4})"
DMY_EUR = r"(?P<day>\d{1,2})/(?P<month>([a-zA-Z]{3}|\d{1,2}))/(?P<year>\d{4})"
SEP     = r"[/T]"
TIME    = r"(?P<h>\d{1,2}):(?P<m>\d{1,2}):(?P<s>\d{1,2}(\.\d*)?)"
HMS     = r"(?P<h>\d{1,2})[hH](?P<m>\d{1,2})[mM](?P<s>\d{1,2}(\.\d*)?)[sS]"
RELDAY  = r"(?P<offset>-?\d+)/"

# Float needs a decimal point somewhere in there, int never, number is either.
# note that numbers never include a leading '-' so parsers are responsible for
# supporting unary '-'
FLOAT  = r"((\d+\.\d*)|\.\d+)([eE]-?\d+)?"
INT    = r"\d+"
NUMBER = r"((\d+(\.\d*)?)|\.\d+)([eE]-?\d+)?"

# combine a date+time format and generate a token definition out of it
datetime_token = lambda date_fmt, time_fmt: token_def(date_fmt+SEP+time_fmt, datetime_t(mk_mjdsecs))
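
# A minimal sketch of how the pieces above compose (illustration only; the toy
# token set below is not used by any of the parsers in this file):
#
#   toy_tokens    = [ number_token(),
#                     token_def(r"-|\+|\*|/", operator_t('operator')),
#                     token_def(r"\s+", ignore_t()) ]
#   toy_tokenizer = mk_tokenizer(toy_tokens)
#   for t in toy_tokenizer("1 + 2.5"):
#       print(t)
#
# would print a 'number' token (value 1), an 'operator' token (value
# operator.add, courtesy of mk_operator) and a 'number' token (value 2.5),
# followed by the end-of-input marker "(type=None, val=None, ...)"; the
# whitespace matches ignore_t() and is swallowed.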
# Always nice to have. Note: if you want to support int and float at the same time,
# care should be taken about the order. If you put 'int_token' before 'float_token'
# then you'll most likely never get a floating point token because the part
# before the decimal point will match the int token
float_token  = lambda : token_def(FLOAT , xform_t('float', float))
int_token    = lambda : token_def(INT   , xform_t('int', int))
number_token = lambda : token_def(NUMBER, number_t())

def parse_scan(qry, **kwargs):
    # Helper functions
    def mk_intrange(txt):
        return hvutil.expand_string_range(txt, rchar='-')

    # take a string and make a "^...$" regex out of it,
    # doing escaping of regex special chars and
    # transforming "*" into ".*" and "?" into "."
    # (basically shell regex => normal regex)
    def pattern2regex(s):
        s = reduce(lambda acc, x: re.sub(x, x, acc), [r"\+", r"\-", r"\."], s)
        s = reduce(lambda acc, t_r: re.sub(t_r[0], t_r[1], acc), [(r"\*+", r".*"), (r"\?", r".")], s)
        return re.compile(r"^"+s+"$")

    def regex2regex(s):
        flagmap = {"i": re.I, None: 0}
        mo = re.match(r"(.)(?P<pattern>.+)\1(?P<flag>.)?", s)
        if not mo:
            raise RuntimeError("'{0}' does not match the regex pattern /.../i?".format(s))
        return re.compile(mo.group('pattern'), flagmap[mo.group('flag')])

    # basic lexical elements
    # These are the tokens for the tokenizer
    tokens = [
        # keywords
        token_def(r"\b(to|not|where|limit)\b", keyword_t()),
        token_def(r"\b(asc|desc)\b", keyword_t()),
        # can't use 'keyword_t()' for the next one because we may need to accept whitespace between "order" and "by"
        token_def(r"\border\s+by\b", simple_t('order by')),
        token_def(r"\bin\b", operator_t('in')),
        token_def(r"\b(and|or|in)\b", operator_t('relop')),
        # Date + time formats
        datetime_token(YMD, TIME),
        datetime_token(YMD, HMS),
        datetime_token(DMY, TIME),
        datetime_token(DMY, HMS),
        datetime_token(DMY_EUR, TIME),
        datetime_token(DMY_EUR, HMS),
        # Relative day offset - note: assume that the global variable
        # 'start' is set correctly ...
        token_def(RELDAY+TIME, datetime_t(mk_seconds)),
        token_def(RELDAY+HMS, datetime_t(mk_seconds)),
        # Time durations
        token_def(r"(?P<neg>-)?(?P<d>\d+)d((?P<h>\d+)[hH])?((?P<m>\d+)[mM])?((?P<s>\d+(\.\d*)?)[sS])?", xformmg_t('duration', mk_seconds)),
        token_def(r"(?P<neg>-)?(?P<h>\d+)[hH]((?P<m>\d+)[mM])?((?P<s>\d+(\.\d*)?)[sS])?", xformmg_t('duration', mk_seconds)),
        token_def(r"(?P<neg>-)?(?P<m>\d+)[mM]((?P<s>\d+(\.\d*)?)[sS])?", xformmg_t('duration', mk_seconds)),
        token_def(r"(?P<neg>-)?(?P<s>\d+(\.\d*)?)[sS]", xformmg_t('duration', mk_seconds)),
        # regex
        token_def(r"/[^/]+/i?", xform_t('regex', regex2regex)),
        token_def(r"[0-9]+-[0-9]+(:[0-9]+)?", xform_t('irange', mk_intrange)),
        float_token(),
        int_token(),
        # Operators that just stand for themselves
        #token_def("~", simple_t('regexmatch')),
        token_def(r"(~|\blike\b)", xform_t('regexmatch', lambda o, **k: lambda x, y: re.match(y, x) is not None)),
        token_def(r"(<=|>=|=|<|>)", operator_t('compare')),
        token_def(r"-|\+|\*|/", operator_t('operator')),
        token_def(r"\(", simple_t('lparen')),
        token_def(r"\)", simple_t('rparen')),
        token_def(r"\[", simple_t('lbracket')),
        token_def(r"\]", simple_t('rbracket')),
        token_def(r",", simple_t('comma')),
        # Textual stuff
        token_def(r"\$(?P<sym>[a-zA-Z][a-zA-Z_]*)", resolve_t('external', 'sym')),
        token_def(r"'[^']*'", value_t('literal')),
        token_def(r"[:@\#%!\.a-zA-Z0-9_?|]+", value_t('text')),
        token_def(r"\s+", ignore_t())
    ]
    tokenizer = mk_tokenizer(tokens, **kwargs)

    # The output of the parsing is a filter function that returns
    # True or False given a scan object
    #next    = lambda s: s.next()
    tok     = lambda s: s.token
    tok_tp  = lambda s: s.token.type
    tok_val = lambda s: s.token.value
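
    # For illustration, the kind of query these tokens support (the scan
    # attributes 'start', 'end' and 'source' used here are hypothetical):
    #
    #   start-10s to end+10s where source ~ 3c* order by start asc limit 5
    #
    # i.e. a per-scan time range modifier, an optional condition, optional
    # sorting and an optional limit - see the grammar below.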

    ###### Our grammar
    # query      = modifier [ 'where' condition ['order by' sorting] ['limit' int] ] eof
    # modifier   = expr 'to' expr
    # expr       = term '+' term | term '-' term | term '*' term | term '/' term | '(' expr ')'
    # term       = duration | number | property | external
    # duration   = \d+ 'd' [\d+ 'h'] [\d+ 'm'] [\d+ ['.' \d*] 's'] |
    #              \d+ 'h' [\d+ 'm'] [\d+ ['.' \d*] 's'] |
    #              \d+ 'm' [\d+ ['.' \d*] 's'] |
    #              \d+ ['.' \d*] 's'
    # number     = int | float
    # property   = alpha char {alpha char | digit | '_'}   # will get property from scan object
    # external   = '$' property                            # will look up value of property in global namespace
    # condition  = condexpr {relop condexpr} | 'not' condexpr | '(' condexpr ')'
    # sorting    = sortterm {',' sortterm}
    # sortterm   = identifier ['asc' | 'desc']
    # condexpr   = property '~' (regex|text) | property compare expr | property 'in' list
    # compare    = '=' | '>' | '>=' | '<' | '<=' ;
    # relop      = 'and' | 'or' ;
    # list       = '[' [value {',' value}] ']'
    # value      = anychar {anychar}
    # regex      = '/' {anychar - '/'} '/' ['i']    ('i' is the case-insensitive match flag)
    # identifier = alpha {character}
    # anychar    = character | symbol
    # character  = alpha | digit
    # alpha      = [a-zA-Z_] ;
    # digit      = [0-9] ;

    # query = expr 'to' expr [ 'where' condition ] eof
    def parse_query(s):
        if tok(s).type is None:
            raise SyntaxError("empty query")
        # parse the scan start/end time modification function first
        perscan_f = parse_modifier(s)
        # if the parse left off at the 'where' keyword, we know what to do
        # note: the 'where' clause is optional and defaults to "all scans"
        # WHERE
        where    = tok(s)
        filter_f = parse_condition(next(s)) if where.type=='where' else lambda x: True
        # "ORDER BY"
        orderby = tok(s)
        if orderby.type=='order by':
            # only allow comma separated list of identifiers
            # 'parse_sort_list' will return a list of sort functions, in the order
            # in which they were given
            sortlist = parse_sort_list(next(s))
            if not sortlist:
                raise SyntaxError("No list of sort keys found (%s)" % tok(s))
            # good, 'sortlist' is a list of sort functions that need to be applied
            # on the list of filtered items
            orderby.value = lambda x: reduce(lambda acc, sortfn: sortfn(acc), sortlist, x)
        else:
            # no sorting
            orderby.value = identity
        # "LIMIT"
        limit = tok(s)
        if limit.type=='limit':
            # we MUST be followed by an int
            next(s)
            ival = tok(s)
            if ival.type!='int':
                raise SyntaxError("Only an integer is allowed after limit, not %s" % ival)
            # consume the integer
            next(s)
            count       = itertools.count()
            limit.value = lambda x: itertools.takewhile(lambda obj: next(count)<ival.value, x)
        else:
            limit.value = identity
        # the only token left should be 'eof' AND, after consuming it,
        # the stream should be empty. Anything else is a syntax error
        try:
            if tok(s).type is None:
                next(s)
        except StopIteration:
            return (perscan_f, compose(List, limit.value, orderby.value, Filter(filter_f)))
        raise SyntaxError("Tokens left after parsing %s" % tok(s))

    # modifier = expr 'to' expr
    def parse_modifier(s):
        # we require two functions to be generated, the start_time_fn (before 'to')
        # and the end_time_fn (after 'to' ...)
        depth         = s.depth
        start_time_fn = parse_expr(s)
        #start_time_fn = parse_expr(s, None)
        if s.depth!=depth:
            raise SyntaxError("Unbalanced parenthesis %s" % tok(s))
        # we now MUST see the 'to' keyword
        to = tok(s)
        if to is None or to.type!='to':
            raise SyntaxError("Unexpected token %s (expected 'to' keyword)" % tok(s))
        # do not forget to consume the 'to' keyword ...
        depth       = s.depth
        end_time_fn = parse_expr(next(s))
        if s.depth!=depth:
            raise SyntaxError("Unbalanced parentheses %s (expect depth %d, found %d)" % (tok(s), depth, s.depth))
        def this_fn(scan):
            s_time = start_time_fn(scan)
            e_time = end_time_fn(scan)
            if e_time<s_time:
                raise RuntimeError("Scan time selection error: end time is before start time in scan\n {0}".format(scan))
            return (s_time, e_time)
        return this_fn
        #return lambda scan: (start_time_fn(scan), end_time_fn(scan))

    # expr = term | expr '+' expr | expr '-' expr | expr '*' expr | expr '/' expr | '(' expr ')' | '-' expr
    def parse_expr(s, unary=False):
        t     = tok(s)
        depth = s.depth
        if t.type in ['lparen', 'rparen']:
            lterm = parse_paren(s)
        elif t.type=='operator' and t.value is mk_operator('-'):
            # unary '-'
            tmpexpr = parse_expr(next(s), unary=True)
            lterm   = lambda scan: operator.neg( tmpexpr(scan) )
        else:
            lterm = parse_term(s)
        # If we see an operator, we must parse the right-hand-side
        # (our argument is the left-hand-side).
        # Well ... not if we're doing unary parsing!
        # if we saw unary '-' then we should parse parens and terms up until
        # the next operator
        oper = tok(s)
        if oper.type=='operator':
            if unary:
                return lterm
            if lterm is None:
                raise SyntaxError("No left-hand-side to operator %s" % oper)
            rterm = parse_expr(next(s))
            if rterm is None:
                raise SyntaxError("No right-hand-side to operator %s" % oper)
            return lambda scan: oper.value(lterm(scan), rterm(scan))
        elif oper.type in ['int', 'float', 'duration', 'datetime']:
            # negative numbers as right hand side are not negative numbers
            # but are operator '-'!
            # so, subtracting a number means adding the negative value (which we already
            # have got)
            # Consume the number and return the operator add
            next(s)
            return lambda scan: operator.add(lterm(scan), oper.value)
        # neither parens, terms, operators?
        return lterm

    def parse_paren(s):
        lparen = tok(s)
        if lparen.type!='lparen':
            raise RuntimeError("Entered parse_paren w/o left paren but %s" % lparen)
        depth   = s.depth
        s.depth = s.depth + 1
        # recurse into parsing the expression - and do NOT forget to consume the lparen!
        expr = parse_expr(next(s))
        # now we should be back at the same depth AND we should be seeing rparen
        rparen = tok(s)
        if rparen.type=='rparen':
            s.depth = s.depth - 1
            next(s)
        return expr

    # term = duration | number | property | external
    def parse_term(s):
        term = tok(s)
        # The easy bits first
        if term.type in ['int', 'float', 'external', 'duration', 'datetime', 'text']:
            if term.type=='text':
                def attrib_or_value(scan):
                    if hasattr(scan, term.value):
                        return getattr(scan, term.value)
                    else:
                        return term.value
                rv = attrib_or_value
            else:
                rv = lambda scan: term.value
            # all's well - eat this term
            next(s)
            return rv
        elif term.type=='literal':
            # ok, allowed to consume it
            next(s)
            # note that we strip the leading and closing single quote
            rv = lambda scan: term.value[1:-1]
            return rv
        return None

    # condition = condexpr {relop condexpr} | 'not' condition | '(' condition ')'
    # relop     = 'and' | 'or' ;
    def parse_condition(s):
        token = tok(s)
        # Recurse if we need to
        if token.type in ['lparen', 'rparen']:
            lterm = parse_paren_condition(s)
        # 'not' expr
        elif token.type=='not':
            # parse the next expr and negate it
            # we MUST have a next one
            condition = parse_condition(next(s))
            if condition is None:
                raise SyntaxError("Missing expression after 'not' %s" % condition)
            lterm = lambda scan: operator.not_( condition(scan) )
        else:
            # it must be a condexpr
            lterm = parse_cond_expr(s)
        # If we now see a relop, we have to parse another condition
        relop = tok(s)
        if relop.type!='relop':
            return lterm
        # consume the relop & parse the condition
        rterm = parse_condition(next(s))
        if lterm is None:
            raise SyntaxError("Missing left-hand-condition to relational operator (%s)" % relop)
        if rterm is None:
            raise SyntaxError("Missing right-hand-condition to relational operator (%s)" % relop)
        # and return the combined operation
        return lambda scan: relop.value(lterm(scan), rterm(scan))

    # condexpr = expr '~' (regex|text) | expr compare expr | expr 'in' list
    # compare  = '=' | '>' | '>=' | '<' | '<=' ;
    def parse_cond_expr(s):
        token = tok(s)
        # No matter what, we have a left and a right hand side
        # separated by an operator
        lterm = parse_expr(s)
        if lterm is None:
            raise SyntaxError("Failed to parse left-hand-term of cond_expr (%s)" % tok(s))
        # Now we must see a comparator
        compare = tok(s)
        if not compare.type in ['compare', 'regexmatch', 'in']:
            raise SyntaxError("Expected a comparison operator, regex match or 'in' keyword, got %s" % compare)
        # consume the comparison
        next(s)
        # do some processing based on the type of operator
        if compare.type=='in':
            rterm = parse_list(s)
        elif compare.type=='compare':
            rterm = parse_expr(s)
        else:
            # must've been regexmatch
            rterm = parse_rx(s)
        # it better exist
        if rterm is None:
            raise SyntaxError("Failed to parse right-hand-term of cond_expr (%s)" % tok(s))
        return lambda scan: compare.value(lterm(scan), rterm(scan))

    def parse_paren_condition(s):
        lparen = tok(s)
        if lparen.type!='lparen':
            raise RuntimeError("Entered parse_paren_condition w/o left paren but %s" % lparen)
        depth   = s.depth
        s.depth = s.depth + 1
        # recurse into parsing the expression - and do NOT forget to consume the lparen!
        expr = parse_condition(next(s))
        # now we should be back at the same depth AND we should be seeing rparen
        rparen = tok(s)
        if rparen.type=='rparen':
            s.depth = s.depth - 1
            next(s)
        return expr

    def parse_rx(s):
        # we accept string, literal and regex and return an rx object
        rx = tok(s)
        if not rx.type in ['regex', 'text', 'literal']:
            raise SyntaxError("Failed to parse string matching regex (not regex, text or literal but %s)" % rx)
        # consume the token
        next(s)
        if rx.type=='literal':
            # extract the pattern from the literal (ie strip the leading/trailing "'" characters)
            rx.value = rx.value[1:-1]
        if rx.type in ['text', 'literal']:
            rx.value = pattern2regex(rx.value)
        return lambda scan: rx.value

    def parse_list(s):
        bracket = tok(s)
        if bracket.type != 'lbracket':
            raise SyntaxError("Expected list-open bracket ('[') but found %s" % bracket)
        rv = []
        # keep eating text + ',' until we read 'rbracket'
        next(s)
        while tok(s).type!='rbracket':
            # if we end up here we KNOW we have a non-empty list because
            # the next token after '[' was NOT ']'
            # Thus if we need a comma, we could also be seeing ']'
            needcomma = len(rv)>0
            #print " ... needcomma=",needcomma," current token=",tok(s)
            if needcomma:
                if tok(s).type=='rbracket':
                    continue
                if tok(s).type!='comma':
                    raise SyntaxError("Badly formed list at {0}".format(tok(s)))
                # and eat the comma
                next(s)
            # now we need a value. 'identifier' is also an acceptable blob of text
            rv.extend( parse_list_item(s) )
        #print "parse_list: ",rv
        # and consume the rbracket (if not rbracket a syntax error is raised above)
        next(s)
        return lambda scan: rv

    # always returns a list-of-items; suppose the list item was an irange
    def parse_list_item(s):
        t = tok(s)
        # current token must be 'text' or 'irange'
        if not t.type in ['text', 'irange', 'int', 'float', 'literal']:
            raise SyntaxError("Failure to parse list-item {0}".format(t))
        next(s)
        # for a literal, strip the leading and closing single quote
        if t.type == 'literal':
            t.value = t.value[1:-1]
        return t.value if t.type == 'irange' else [t.value]

    # attribute list = identifier {',' identifier}
    def parse_sort_list(s):
        rv = []
        rxAttribute = re.compile(r"^[a-zA-Z][a-zA-Z0-9_]*$")
        while True:
            item = tok(s)
            if item.type!='text':
                raise SyntaxError("attribute list may only contain strings, found %s" % item)
            if not rxAttribute.match(item.value):
                raise SyntaxError("%s is not a valid attribute name" % item)
            # Peek at the next token.
            # If it's asc/desc take that into account
            next(s)
            order = tok(s)
            if order.type in ['asc', 'desc']:
                # consume it
                next(s)
            else:
                order.type = 'asc'
            # create a sorting function
            def mk_sf(attr, order):
                def do_it(x):
                    return sorted(x, key=operator.attrgetter(attr), reverse=(order=='desc'))
                return do_it
            rv.append( mk_sf(item.value, order.type) )
            # if we don't see a comma next, we break
            if tok(s).type!='comma':
                break
            # consume the comma
            next(s)
        # primary sort key is now first in list but for the sorting to work in steps
        # (see https://wiki.python.org/moin/HowTo/Sorting ) we must apply the sorting
        # functions in reverse order
        return reversed(rv)

    class state_type:
        def __init__(self, tokstream):
            self.tokenstream = tokstream
            self.depth       = 0
            next(self)
        def __next__(self):
            self.token = next(self.tokenstream)
            return self
        next = __next__

    tokenizer = mk_tokenizer(tokens, **kwargs)
    return parse_query(state_type(tokenizer(qry)))

# Time (range) grammar - we must support some arithmetic
#
# timerange  = expr { 'to' ('+' duration | expr) }
# expr       = term '+' term | term '-' term | term '*' term | term '/' term | '(' expr ')'
# term       = number | identifier | datetime | duration | reltime
# datetime   = year '/' month '/' day [T/] timestamp | day '-' month '-' year [T/] timestamp |
#              day '-' monthstr '-' year [T/] timestamp
# reltime    = {'-'} digit {digit} '/' timestamp
# reltime    = {'-'} digit {digit} '/' duration
# year       = 4 * digit
# month      = 2 * digit
# monthstr   = 3 * alpha char
# day        = 2 * digit
# timestamp  = digit {digit} [hH] digit {digit} [mM] digit {digit} {'.' digits} [sS] |
#              digit {digit} ':' digit {digit} ':' digit {digit} {'.' digits}
# duration   = number [hH] { number [mM] { number ['.' number] [sS] } } |
#              number [mM] { number ['.' number] [sS] } |
#              number ['.' number] [sS]
# scan prop  = 'scan' digits '.' identifier
# number     = {'-'} {[0-9]+} {'.'} [0-9]+ {[eE]{-}[0-9]+}
# identifier = alpha char {alpha char | digit | '_'}
# digits     = digit {digit}
# digit      = [0-9]
# alpha char = [a-zA-Z]

SEC  = r"(?P<s>\d+(\.\d*)?)[sS]"
MIN  = r"(?P<m>\d+)[mM]"
HR   = r"(?P<h>\d+)[hH]"
DAY  = r"(?P<d>\d+)d"
DUR4 = DAY+MAYBE(HR)+MAYBE(MIN)+MAYBE(SEC)
DUR3 = HR+MAYBE(MIN)+MAYBE(SEC)
DUR2 = MIN+MAYBE(SEC)
DUR1 = SEC
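
# Illustrative inputs for parse_time_expr() below (the relative-day forms
# assume 'start' is passed in via the environment, as noted in the token list):
#
#   2013/6/4/12h30m00s to +30m    - absolute start; '+ duration' means "start plus 30 minutes"
#   1/13h00m to 1/14h00m          - day offsets relative to the 'start' environment value
#   12/9/2013T11:00:00            - a single time stamp yields the range (t, t)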
def parse_time_expr(txt, **env):
    # Helper functions
    # basic lexical elements
    # These are the tokens for the tokenizer
    tokens = [
        # keywords
        token_def(r"\bto\b", keyword_t()),
        # Date + time formats
        datetime_token(YMD, TIME),
        datetime_token(YMD, HMS),
        datetime_token(DMY, TIME),
        datetime_token(DMY, HMS),
        datetime_token(DMY_EUR, TIME),
        datetime_token(DMY_EUR, HMS),
        # Relative day offset - note: assume that the global variable
        # 'start' is set correctly ...
        token_def(RELDAY+TIME, datetime_t(mk_seconds)),
        token_def(RELDAY+HMS, datetime_t(mk_seconds)),
        token_def(RELDAY+DUR3, datetime_t(mk_seconds)),
        token_def(RELDAY+DUR2, datetime_t(mk_seconds)),
        token_def(RELDAY+DUR1, datetime_t(mk_seconds)),
        # Time durations
        token_def(DUR4, xformmg_t('duration', mk_seconds)),
        token_def(DUR3, xformmg_t('duration', mk_seconds)),
        token_def(DUR2, xformmg_t('duration', mk_seconds)),
        token_def(DUR1, xformmg_t('duration', mk_seconds)),
        # Operators and semantic elements that just stand for themselves
        token_def(r"-|\+|\*|/", operator_t('operator')),
        token_def(r"\(", simple_t('lparen')),
        token_def(r"\)", simple_t('rparen')),
        token_def(r',', simple_t('comma')),
        # we don't care about int's or float's - any number we'll accept
        number_token(),
        token_def(r"\$(?P<sym>[a-zA-Z][a-zA-Z_]*)", resolve_t('external', 'sym')),
        token_def(r"\s+", ignore_t())
    ]

    # shorthands that work on the parser state 's'
    #next    = lambda s: s.next()
    tok     = lambda s: s.token
    tok_tp  = lambda s: s.token.type
    tok_val = lambda s: s.token.value

    # timeranges = timerange {',' timerange}
    # timerange  = expr ['to' expr]
    def parse_time_ranges(s):
        rv = []
        while True:
            rv.append( parse_time_range(s) )
            # Check for more time ranges
            nxt = tok(s)
            if nxt.type=='comma':
                # consume the comma and continue
                next(s)
                continue
            break
        # Ok, we should now see 'eof' and an empty stream
        try:
            if tok(s).type is None:
                next(s)
        except StopIteration:
            return rv
        raise SyntaxError("Tokens left after parsing %s" % tok(s))

    # timerange = expr ['to' expr]
    def parse_time_range(s):
        # we require two values to be generated, the start time (before 'to')
        # and the end time (after 'to' ...)
        depth      = s.depth
        start_time = parse_expr(s)
        if s.depth!=depth:
            raise SyntaxError("Unbalanced parenthesis %s" % tok(s))
        if start_time is None:
            raise SyntaxError("Missing start-time expression %s" % tok(s))
        # If we don't see the 'to' keyword, it's a single time stamp
        to = tok(s)
        if to.type!='to':
            return (start_time, start_time)
        # insert the current value in the environment such that
        # the 'end_time' may use "+ duration"
        env['$#parsed:start^time#$'] = start_time
        # parse the end time after consuming the 'to' keyword
        depth    = s.depth
        end_time = parse_expr(next(s))
        # remove the value from the environment again
        del env['$#parsed:start^time#$']
        if s.depth!=depth:
            raise SyntaxError("Unbalanced parentheses %s (expect depth %d, found %d)" % (tok(s), depth, s.depth))
        if end_time is None:
            raise SyntaxError("Missing end-time expression %s" % tok(s))
        if end_time<start_time:
            t = tok(s)
            raise RuntimeError("{0}\n{1:>{2}s}^\nend time is before start time here".format(txt, "", len(txt) if t.position<0 else t.position))
        return (start_time, end_time)

    # expr = term | expr '+' expr | expr '-' expr | expr '*' expr | expr '/' expr | '(' expr ')' | '-' expr
    def parse_expr(s, unary=False):
        t     = tok(s)
        depth = s.depth
        if t.type in ['lparen', 'rparen']:
            lterm = parse_paren(s)
        elif t.type=='operator' and t.value is mk_operator('-'):
            # unary '-'
            tmpexpr = parse_expr(next(s), unary=True)
            lterm   = operator.neg( tmpexpr )
        elif t.type=='operator' and t.value is mk_operator('+'):
            # unary '+' - may be followed by an expression; we add the parsed time
            # to whatever the start time was
            next(s)
            duration = parse_expr(s)
            return env['$#parsed:start^time#$'] + duration
        else:
            lterm = parse_term(s)
        # If we see an operator, we must parse the right-hand-side
        # (our argument is the left-hand-side).
        # Well ... not if we're doing unary parsing!
        # if we saw unary '-' then we should parse parens and terms up until
        # the next operator
        oper = tok(s)
        if oper.type=='operator':
            if unary:
                return lterm
            if lterm is None:
                raise SyntaxError("No left-hand-side to operator %s" % oper)
            rterm = parse_expr(next(s))
            if rterm is None:
                raise SyntaxError("No right-hand-side to operator %s" % oper)
            return oper.value(lterm, rterm)
        elif oper.type in ['int', 'float', 'duration', 'datetime']:
            # negative numbers as right hand side are not negative numbers
            # but are operator '-'!
            # so, subtracting a number means adding the negative value (which we already
            # have got)
            # Consume the number and return the operator add
            next(s)
            return operator.add(lterm, oper.value)
        # neither parens, terms, operators?
        return lterm

    def parse_paren(s):
        lparen = tok(s)
        if lparen.type!='lparen':
            raise RuntimeError("Entered parse_paren w/o left paren but %s" % lparen)
        depth   = s.depth
        s.depth = s.depth + 1
        # recurse into parsing the expression - and do NOT forget to consume the lparen!
        expr = parse_expr(next(s))
        # now we should be back at the same depth AND we should be seeing rparen
        rparen = tok(s)
        if rparen.type=='rparen':
            s.depth = s.depth - 1
            next(s)
        return expr

    # term = duration | number | external
    def parse_term(s):
        term = tok(s)
        # The easy bits first
        if term.type in ['number', 'external', 'duration', 'datetime']:
            # all's well - eat this term
            next(s)
            return term.value
        return None

    class state_type:
        def __init__(self, tokstream):
            self.tokenstream = tokstream
            self.depth       = 0
            next(self)
        def __next__(self):
            self.token = next(self.tokenstream)
            return self
        next = __next__

    tokenizer = mk_tokenizer(tokens, **env)
    return parse_time_ranges(state_type(tokenizer(txt)))

# Parse a simple duration (sort of VEX format):
#     ...y..d..h..m....s
# there must be at least one unit present; trailing fields after the highest order unit are optional.
# only accepts units in this order; e.g. cannot say 10s1h
MINf  = r"(?P<m>\d+(\.\d*)?)[mM]"
HRf   = r"(?P<h>\d+(\.\d*)?)[hH]"
DAYf  = r"(?P<d>\d+(\.\d*)?)d"
DUR4f = DAYf+MAYBE(HRf)+MAYBE(MINf)+MAYBE(SEC)
DUR3f = HRf+MAYBE(MINf)+MAYBE(SEC)
DUR2f = MINf+MAYBE(SEC)

def parse_duration(txt, **env):
    # Helper functions
    # basic lexical elements
    # These are the tokens for the tokenizer
    tokens = [
        # Time durations
        token_def(DUR4f, xformmg_t('duration', mk_seconds)),
        token_def(DUR3f, xformmg_t('duration', mk_seconds)),
        token_def(DUR2f, xformmg_t('duration', mk_seconds)),
        token_def(DUR1, xformmg_t('duration', mk_seconds)),
        token_def(r"\S+", lambda o, **kwargs: token_type('gunk', o.group(0)))
    ]

    # shorthands that work on the parser state 's'
    #next    = lambda s: s.next()
    tok     = lambda s: s.token
    tok_tp  = lambda s: s.token.type
    tok_val = lambda s: s.token.value

    # duration = ..y..d..h..m..s
    def parse_time_duration(s):
        dur = tok(s)
        if dur.type!='duration':
            raise SyntaxError("This is not a duration - %s" % dur)
        next(s)
        try:
            if tok(s).type is None:
                next(s)
        except StopIteration:
            return dur.value
        raise SyntaxError("Tokens left after parsing %s" % tok(s))

    class state_type:
        def __init__(self, tokstream):
            self.tokenstream = tokstream
            self.depth       = 0
            next(self)
        def __next__(self):
            self.token = next(self.tokenstream)
            return self
        next = __next__

    tokenizer = mk_tokenizer(tokens, **env)
    return parse_time_duration(state_type(tokenizer(txt)))
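
# Illustrative use of parse_duration(); the values follow from mk_seconds():
#
#   parse_duration("1h30m")   => 5400.0      (1*3600 + 30*60)
#   parse_duration("2d12h")   => 216000.0    (2*86400 + 12*3600)
#
# whereas parse_duration("10s1h") raises SyntaxError ("Tokens left after
# parsing ..."): units are only accepted in descending order.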
########################################################################################################
#
# data set expression parser
# allows manipulation of results of plotiterator - e.g. differencing
#
#########################################################################################################
# <dscmd>     = 'store' {<expr>} 'as' <id> | 'load' <expr>
# <expr>      = <expr> '+' <term> | <expr> '-' <term> | <term>
# <term>      = <term> '*' <factor> | <term> '/' <factor> | <factor>
# <factor>    = <exponent> '^' <factor> | <exponent>
# <exponent>  = '-' <exponent> | <final>
# <final>     = <number> | <id> | <id> '(' <arglist> ')' | '(' <expr> ')' | <id> '[' <subscript> ']'
# <arglist>   = <empty> | <expr> {',' <expr>}
# <subscript> = <filter> {',' <filter>}
# <filter>    = <attribute> '=' <value>
# <attribute> = 'p' | 'ch' | 'sb' | 'src' | 'time' | 'bl' | 'fq' |
#               'P' | 'CH' | 'SB' | 'SRC' | 'TIME' | 'BL' | 'FQ'
# <value>     = <int> | <alnum>    # may/should/will depend on type of attribute!
#
# <number>    = <int> | <float>
# <int>       = [0-9]+
# <float>     = [0-9]+'.'[0-9]* | '.'[0-9]+
# <id>        = <alnum>{'.'<alnum>}
# <alnum>     = [a-zA-Z_][a-zA-Z0-9_]*

methodwrappert = type({}.__delitem__)

def isAttr(o):
    # in fact, 'inspect.is<predicate>' predicates are useless. They still return all
    # the members of e.g. a "dict()". Because all of the methods of 'dict' are now
    # of type <method-wrapper>. Gah!
    #return not (inspect.isfunction(o) or inspect.ismethod(o))
    return not (inspect.isbuiltin(o) or isinstance(o, methodwrappert))

def copy_attributes(outp, inp):
    drap(lambda a_v: setattr(outp, a_v[0], a_v[1]),
         map(lambda a_tp: (a_tp[0], getattr(inp, a_tp[0])),
             filter(lambda nm_tp: not nm_tp[0].startswith('__'), inspect.getmembers(inp, isAttr))))
    return outp

def ds_flat_filter(value, tp=None, subscript=None):
    rv = copy_attributes(plotutil.Dict(), value)
    (mklabf, subquery) = (identity, const(True)) if subscript is None else subscript
    for ds in (value.keys() if tp is None else filter(lambda k: k.TYPE==tp, value.keys())):
        # Do we accept this dataset?
        if not subquery(ds):
            continue
        # if anames is set, it means we've filtered/subindexed so we must create a new label with
        # the indicated anames set to None [such that the crossmatching on those won't fail]
        nds     = mklabf(ds)
        rv[nds] = value[ds].sort()
    return rv

def ds_key_filter(value, keys):
    rv = copy_attributes(plotutil.Dict(), value)
    for k in keys:
        rv[k] = value[k]
    return rv

dictType = type({})
#isDataset = lambda x: isinstance(x, plotutil.plt_dataset)
isDataset = lambda x: isinstance(x, dictType)

def normal_apply(l, f, r):
    return (None, (l._xval, f(l._yval.data, r._yval.data)))

def shortest_apply(l, f, r):
    n = min(len(l._yval.data), len(r._yval.data))
    return ("Truncated to {0} elements".format(n), (l._xval.data[:n], f(l._yval.data[:n], r._yval.data[:n])))

isect_table = {
    # ( <lengths equal>, <one of 'm has length 1> )
    (True , False): normal_apply,   # when both lengths are equal it doesn't matter how long
    (True , True) : normal_apply,   # ..
    (False, True) : normal_apply,   # if unequal lengths but one of 'm has length 1
    (False, False): shortest_apply  # only apply to first 'n' elements
}

# args = (cond1(a), msg1(a)), (cond2(a), msg2(a)), ...
# i.e. tuples with two elements: a function to test the result and a function to produce
# the error message.
# Both functions get passed the full plot dataset.
#
# Note: processing stops after the first test that triggers
def maybe_warn(a, *args):
    for (cond, msg) in args:
        if cond(a):
            print(msg(a))
            break
    return a

def do_isect(d0, f, d1):
    # we know both d0 and d1 are flattened datasets
    # so we must iterate over the set of identical keys;
    # for each key we apply the operation to the y-part of the datasets
    def app(acc, key):
        # make sure they're numarrays
        # [note: .as_numarray() does not create a new object, just returns 'self']
        ds0 = d0[key]
        ds1 = d1[key]
        l0  = len(ds0._yval)
        l1  = len(ds1._yval)
        # compare lengths (...) and decide what to do
        (msg, res) = isect_table[(l0==l1, l0==1 or l1==1)](ds0, f, ds1)
        if msg is not None:
            print("{0}: {1}".format(key, msg))
        # the new dataset has the combined flags of the two participating datasets
        nOutput    = len(res[0])
        acc[ key ] = plotutil.plt_dataset(res[0], res[1], numpy.logical_or(ds0._m_flagged[:nOutput], ds1._m_flagged[:nOutput]))
        return acc
    nk0 = len(d0.keys())
    nk1 = len(d1.keys())
    return maybe_warn(reduce(app, set(d0.keys()) & set(d1.keys()), copy_attributes(plotutil.Dict(), d0)),
                      (lambda a: not a,
                       lambda _: "do_isect(d0, f, d1): no common data sets were found, d1 has {} keys, d2 has {}".format(nk0, nk1)),
                      (lambda a: len(a.keys())!=nk0 or len(a.keys())!=nk1,
                       lambda a: "do_isect(d0, f, d1): only {} common keys between d1 and d2 ({} and {} keys)".format(len(a.keys()), nk0, nk1)))

# implement infix operator 'f' on two datums
def immediate_apply(l, f, r):
    return f(l, r)

def do_iterate(d0, f, d1):
    # we know that either d0 or d1 is a dataset
    proto = d0 if isDataset(d0) else d1
    # from the prototype dataset
    # apply in the correct order!
    app = (lambda d: f(d, d1)) if isDataset(d0) else (lambda d: f(d0, d))
    def reductor(acc, k_ds):
        (key, ds) = k_ds
        acc[ key ] = plotutil.plt_dataset(ds._xval.data, app(ds._yval.data), ds._yval.mask)
        return acc
    return reduce(reductor, iteritems(proto), copy_attributes(plotutil.Dict(), proto))

applicator_table = {
    # infix: lhs <operator> rhs
    # table key is:
    #   ( <isDataset lhs>, <isDataset rhs> )
    (False, False): immediate_apply, # short-circuit direct evaluation of non-datasets
    (True , False): do_iterate,      # one of 'm is a dataset
    (False, True) : do_iterate,      # id.
    (True , True) : do_isect         # both are datasets, must intersect
}

def applicator(d0, f, d1):
    # for both arguments we want a set of keys such that we can get the intersection
    # of identical keys. But that's only if both of 'm are datasets;
    # otherwise it's either just numbers that are combined or one of them is a data set
    return applicator_table[(isDataset(d0), isDataset(d1))](d0, f, d1)
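
# Illustrative dispatches through the tables above: scalar <op> dataset iterates
# over the dataset, dataset <op> dataset intersects on common keys, and two
# scalars are combined directly:
#
#   applicator(3, operator.mul, some_dataset)   # (False, True)  -> do_iterate
#   applicator(ds_a, operator.add, ds_b)        # (True, True)   -> do_isect
#   applicator(1, operator.add, 2)              # (False, False) -> immediate_apply => 3
#
# ('some_dataset', 'ds_a' and 'ds_b' stand for hypothetical flattened datasets)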
def mk_dataset(ds, an):
    if not isDataset(ds):
        return ds
    if an is not None:
        ds.msname = copy.deepcopy(an)
    return ds

def parse_dataset_expr(txt, datasets, **env):
    ident       = r"[a-zA-Z_][a-zA-Z0-9_]*"
    identifier  = NAMED("name", ident)+MAYBE(r"\."+NAMED("type", ident))
    unary_minus = mk_operator('-')
    # extract the expression annotation
    annotation  = hvutil.sub(txt, [(r"^\s*load\s*", ""), (r"^\s*store\s*", ""), (r"\bas\b.*", "")]).strip()

    # basic lexical elements
    # These are the tokens for the tokenizer
    tokens = [
        # keywords
        token_def(r"\b(store|load|as)\b", keyword_t()),
        # the attribute names
        #(re.compile(r"\b(p|ch|sb|fq|bl|time|src)\b", re.I), value_t('attrname')),
        (re.compile(r"\b(p|ch|sb|bl|src)\b", re.I), value_t('attrname')),
        # operators
        token_def(r"-|\+", operator_t('additive')),
        token_def(r"\*|/", operator_t('multiplicative')),
        token_def(r"\^", operator_t('exponent')),
        token_def(r",", simple_t('comma')),
        token_def(r"=", simple_t('equal')),
        token_def(r"\(", simple_t('lparen')),
        token_def(r"\)", simple_t('rparen')),
        token_def(r"\[", simple_t('lbracket')),
        token_def(r"\]", simple_t('rbracket')),
        # identifiers (function call) and variables: <name>{.<type>}
        token_def(identifier, xformmg_t('id', lambda mo: mo.groupdict())),
        token_def(ident, value_t('text')),
        #token_def(r"\$"+dsid, value_t('dsid')),
        # numbers
        number_token(),
        # not particularly interested in whitespace
        token_def(r"\s+", ignore_t())
        # Time durations
        #token_def(DUR4f, xformmg_t('duration', mk_seconds)),
        #token_def(DUR3f, xformmg_t('duration', mk_seconds)),
        #token_def(DUR2f, xformmg_t('duration', mk_seconds)),
        #token_def(DUR1, xformmg_t('duration', mk_seconds)),
        #token_def(r"\S+", lambda o, **kwargs: token_type('gunk', o.group(0)))
    ]

    # shorthands that work on the parser state 's'
    #next    = lambda s: s.next()
    tok     = lambda s: s.token
    tok_tp  = lambda s: s.token.type
    tok_val = lambda s: s.token.value

    #def parse_dataset_expr_impl(s):
    #    try:
    #        while True:
    #            t = tok(s)
    #            print "[{0}/{1}] ".format(t.type, t.value)
    #            if t.type is None:
    #                break
    #            next(s)
    #        next(s)
    #        raise SyntaxError("Tokens left after 'None' [tp={0}]".format(tok(s).type))
    #    except StopIteration:
    #        pass
    #    #raise RuntimeError,"Not implemented yet"

    # entry point of grammar
    def parse_dataset_expr_impl(s):
        # only 'load' or 'store' are supported at this point
        supported = { 'load': parse_load_dataset, 'store': parse_store_dataset }
        cur = tok(s)
        if cur.type not in supported:
            raise SyntaxError("Unexpected token {0} instead of {1}".format(cur.type, list(supported.keys())))
        else:
            # eat up the token and dive in
            rv = supported[cur.type]( next(s) )
        # After parsing we should see EOF and then nothing
        try:
            next(s)
            if tok(s).type is None:
                next(s)
            else:
                raise SyntaxError("Trailing token(s) {0}".format( tok(s).value ))
        except StopIteration:
            # fine
            pass
        return rv

    def parse_store_dataset(s):
        # 'store' {expr} 'as' <id>
        # 'store' <expr> {'as' <id>}
        # look at the next token, it could be either
        # an expression or the 'as' keyword, to indicate
        # that the current plots need to be stored
        cur  = tok( s )
        expr = None
        if cur.type != 'as':
            # we expect an expression here
            expr = parse_expr( s )
            if s.depth!=0:
                raise SyntaxError("Unbalanced parenthesis")
            cur = tok( s )
        name = None
        if cur.type == 'as':
            # eat up the 'as' keyword and expect an id
            next(s)
            cur = parse_id( s )
            if cur.type != 'id':
                raise SyntaxError("Unexpected token {0} [expected variable name]".format( cur.value ))
SyntaxError("Unexpected token {0} [expected variable name]".format( cur.value )) # verify that dsid does not address a subset $id.id but just $id if cur.value['type'] is not None: raise SyntaxError("Cannot store an expression as subset {0}".format( cur.value['type'] )) if 'filter' in cur.value: raise SyntaxError("Cannot store an expression as filtered subset of {0}".format( cur.value['name'] )) name = cur.value['name'] elif cur.type is not None: raise SyntaxError("Unexpected token {0} [expected 'as' or nothing]".format( cur.value )) # If both name and expr as None, that is a syntax error if expr is None and name is None: raise SyntaxError("Empty store command?!") # if either is None, we can provide defaults for that def mk_f(expression, nm): def do_it(ds): an = annotation expr = expression if expr is None: expr = lambda ds: ds_flat_filter(ds['_']) an = None else: if re.match(r"^\$[a-zA-Z_\.]+$", an): an = None n = nm if n is None: n = '_' # decorate with annotation if necessary ds[n] = mk_dataset(expr(ds), an) return ds[n] return do_it return mk_f(expr, name) def parse_load_dataset(s): # 'load' expr def mk_f(expression): def do_it(ds): an = annotation if re.match(r"^\$[a-zA-Z_\.]+$", an): an = None # decorate with annotation, if necessary ds['_'] = mk_dataset(expression(ds), an) return ds['_'] return do_it expr = parse_expr( s ) if s.depth!=0: raise SyntaxError("Unbalanced parenthesis") return mk_f( expr ) #@argprint # <name>{.<type>}{'[' <filter> ']'} def parse_id(s): # we should *at least* be looking at an 'id' token cur = tok( s ) if cur.type != 'id': raise SyntaxError("Expected an identifier here") # look ahead to see if we find '[' which would indicate subscripting/filtering next(s) if tok(s).type == 'lbracket': cur.value['filter'] = parse_filter( s ) return cur #@argprint # '[' <filter> ']' def parse_filter(s): # check if we indeed are looking at start of filter/subscripting and if so eat up that token if tok(s).type != 'lbracket': raise RuntimeError("Entered parse_filter() but not looking at '['?") next(s) # now we must see a comma separated list of <attr> = <value> rv = [] while tok(s).type!='rbracket': # if we end up here we KNOW we have a non-empty list because # the next token after '[' was NOT ']' # Thus if we need a comma, we could also be seeing ']' needcomma = len(rv)>0 if needcomma: if tok(s).type=='rbracket': continue if tok(s).type!='comma': raise SyntaxError("Badly formed list at {0}".format(tok(s))) # and eat the comma next(s) # now we need a list item "<attr> = <value>" rv.append( parse_list_item(s) ) # and consume the rbracket (if not rbracket a syntax error is raised above) next(s) # convert the list of attribute matchers into a single match fn def mk_match_f(l): def do_it(ds): return all([cond(ds) for cond in l]) return do_it def mk_lab_f(l): # label._attrs is the set of all attributes # l is the list/set of attributes that we need to replace with None # So what we do is construct the new label from the old one # but tell it to only take the values not in l! 
            # (values not in the list of attributes to copy will be initialized to None by
            #  the new label class)
            attrs_to_copy = plotutil.label._attrs - set(l)
            def do_it(ds):
                return plotutil.label(ds, attrs_to_copy)
            return do_it
        (anames, matchfns) = zip_(*rv)
        return (mk_lab_f(anames), mk_match_f(matchfns))

    #@argprint
    # <attribute> '=' <value>
    # returns function which matches label attribute value to <value>
    def parse_list_item(s):
        attrtype_dict  = { 'P':str, 'CH':int, 'SB':int, 'BL':re.compile, 'SRC':re.compile }
        attrmatch_dict = { 'P':operator.eq, 'CH':operator.eq, 'SB':operator.eq, 'BL':re.match, 'SRC':re.match }
        attrname = tok(s)
        if attrname.type!='attrname':
            raise SyntaxError("Expected attribute name but found {0}".format( attrname.type ))
        attrname = attrname.value.upper()
        # must see '='
        if tok(next(s)).type!='equal':
            raise SyntaxError("Expected '=' but found {0}".format( tok(s).type ))
        # next we should see <int> or <alnum>
        attrval = tok( next(s) )
        if attrval.type not in ['id', 'number', 'text']:
            raise SyntaxError("Expected a number or text but found {0}".format( attrval.type ))
        # ok, eat that one up
        next(s)
        def mk_amatch_f(aname, aval):
            # convert once
            aval = attrtype_dict[aname](aval)
            def do_it(ds):
                return attrmatch_dict[aname](aval, getattr(ds, aname))
            return do_it
        return (attrname, mk_amatch_f(attrname, attrval.value if attrval.type in ['number', 'text'] else attrval.value['name']))

    #@argprint
    def parse_expr( s ):
        # <expr> = <expr> '+' <term> | <expr> '-' <term> | <term>
        expr = parse_term( s )
        if expr is None:
            return None
        # If we're looking at an additive operator now,
        # we must parse the rhs
        cur = tok( s )
        if cur.type=='additive':
            # eat this token
            next( s )
            rhs = parse_expr( s )
            if rhs is None:
                raise SyntaxError("Expected a term, got {0}".format( tok(s).value ))
            def mk_f(l, o, r):
                def do_it(ds):
                    return applicator(l(ds), o, r(ds))
                return do_it
            expr = mk_f(expr, cur.value, rhs)
        return expr

    #@argprint
    def parse_term( s ):
        # <term> = <term> '*' <factor> | <term> '/' <factor> | <factor>
        term = parse_factor( s )
        # if no lhs (yet) we must check for a factor
        if term is None:
            return None
        # Now we could be looking at a multiplicative operator
        cur = tok( s )
        if cur.type=='multiplicative':
            # eat it and try to parse a factor
            next( s )
            rhs = parse_term( s )
            # oh noes!
            if rhs is None:
                raise SyntaxError("Expected a factor, got {0}".format( tok(s).value ))
            def mk_f(l, o, r):
                def do_it(ds):
                    return applicator(l(ds), o, r(ds))
                return do_it
            term = mk_f(term, cur.value, rhs)
        return term

    #@argprint
    def parse_factor(s):
        # <factor> = <exponent> '^' <factor> | <exponent>
        # if no exponent yet, look for one
        exponent = parse_exponent(s)
        if exponent is None:
            return None
        # if we're looking at '^', we must parse a factor
        cur = tok( s )
        if cur.type=='exponent':
            # eat the token and try to parse another exponent
            next( s )
            factor = parse_factor(s)
            if factor is None:
                raise SyntaxError("Expected an exponent, got {0}".format( tok(s).value ))
            def mk_f(e, o, f):
                def do_it(ds):
                    return applicator(e(ds), o, f(ds))
                return do_it
            exponent = mk_f(exponent, cur.value, factor)
        return exponent
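
    # Taken together, parse_expr/parse_term/parse_factor (and parse_exponent
    # below) give the usual precedence: in "a + b * c ^ 2" the exponent binds
    # tightest, then the multiplication, i.e. it evaluates as a + (b * (c ^ 2)).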
    #@argprint
    def parse_exponent(s):
        # <exponent> = '-' <exponent> | <final>
        cur = tok( s )
        if cur.type=='additive' and cur.value==unary_minus:
            # eat the minus sign
            next( s )
            exponent = parse_exponent(s)
            def mk_f(e):
                def do_it(ds):
                    return applicator(-1, operator.mul, e(ds))
                return do_it
            exponent = mk_f(exponent)
        else:
            # must be a final then?
            exponent = parse_final(s)
        return exponent

    #@argprint
    def parse_final(s):
        # <final> = <number> | <id> | <id> '(' <expr> ')' | '(' <expr> ')'
        # look at current token
        final = tok( s )
        # Check if we recognize it
        if final.type=='number':
            # ok, we know we recognize the token, let's eat the number
            next( s )
            def mk_f(v):
                def do_it(ds):
                    return v
                return do_it
            return mk_f(final.value)
        elif final.type=='id':
            # ok, we know we recognize the token, let's eat it up [this was the 'id']
            final = parse_id( s )
            # now, before returning something, check the new current token;
            # if it happens to be 'lparen', we're looking at a functioncall!
            if tok(s).type=='lparen':
                # if the id had a filter, then it cannot be a functioncall!
                #   aap.phase[p=ll] (...)  should not parse
                #   mod.fn (...)           might be ok: python "module.function" instead of "variable.type"
                if 'filter' in final.value:
                    raise SyntaxError("A subscripted variable cannot be used as function call?!")
                # eat the paren, then parse the argument list
                next( s )
                arglist = parse_arglist(s, [])
                # after parsing the arglist we MUST see 'rparen' or else the user's a fool
                if tok(s).type!='rparen':
                    raise SyntaxError("Expected ')' after functioncall, got {0}".format( tok(s).value ))
                # eat the rparen
                next( s )
                # and return the useful bits
                def mk_f(fn, al):
                    def do_it(ds):
                        # lookup fn
                        #callable_obj = do_resolve(fn)
                        print("{0}({1})".format(fn, ",".join([repr(x(ds)) for x in al])))
                        #return apply_fn(callable_obj, ds, al)
                        return 42
                    return do_it
                return mk_f(final.value, arglist)
            else:
                # no functioncall, just variable addressment
                def mk_f(nm):
                    def do_it(ds):
                        nam = nm['name']
                        typ = nm['type']
                        if nam not in ds:
                            raise RuntimeError("Variable '{0}' does not exist".format(nam))
                        if isDataset(ds[nam]):
                            return ds_flat_filter(ds[nam], typ, nm.get('filter', None))
                        else:
                            return ds[nam][typ] if typ is not None else ds[nam]
                    return do_it
                return mk_f(final.value)
        elif final.type=='lparen':
            # ah, parenthesis! eat up the '('
            next( s )
            s.depth = s.depth + 1
            expr = parse_expr(s)
            # now we should see ')'
            cur = tok( s )
            if cur.type!='rparen':
                raise SyntaxError("Expected ')' but got {0}".format( cur.value ))
            s.depth = s.depth - 1
            next( s )
            return expr
        else:
            # we don't actually recognize this token here?
            return None

    #@argprint
    def parse_arglist(s, al):
        # <arglist> = <empty> | <list>
        # <list>    = <expr> {',' <expr>}
        # Basically we're done if we see 'rparen';
        # don't eat the token because the upper level needs to see it to
        # make sure that parens are balanced
        cur = tok( s )
        if cur.type=='rparen':
            return al
        return parse_list(s, al)

    #@argprint
    def parse_list(s, al):
        # we MUST see an expression now
        arg = parse_expr(s)
        if arg is None:
            raise SyntaxError("Empty argument is not allowed (current token={0})".format( tok(s).value ))
        # append it to the argument list
        al.append( arg )
        # inspect current token; if it's a comma we recurse
        if tok(s).type=='comma':
            next(s)
            return parse_list(s, al)
        # if we don't see a comma, we return and let the upper levels decide whether the current token is a nice one.
        return al
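
    # Example command strings this grammar accepts ('a' and 'b' stand for
    # variables that would live in the dataset dictionary; purely illustrative):
    #
    #   load a - b
    #   store a.phase - b.phase as diff
    #   store a[p=ll] + a[p=rr] as summed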
    class state_type:
        def __init__(self, tokstream):
            self.tokenstream = tokstream
            self.depth       = 0
            next(self)
        def __next__(self):
            self.token = next(self.tokenstream)
            return self
        next = __next__
        def __str__(self):
            return "<{0}/{1}>".format(self.depth, self.token)

    tokenizer = mk_tokenizer(tokens, **env)
    return parse_dataset_expr_impl(state_type(tokenizer(txt)))(datasets)

########################################################################################################
#
# colorkey expression parser
# allows for assigning specific color indices to data sets, based on attribute value(s)
#
#########################################################################################################
# The idea is that the user can type an expression:
#
#   > ckey P[LL]=1 P[RR]=2
#
# such that all data sets where the 'P'(olarization) attribute has the value 'LL' get color index '1'
# and those with 'RR' get color index '2'.
#
# More detailed selections are possible:
#
#   > ckey SB[0], BL[/wb*/]=1
#
# without constraints it does 'iota' = automatic counting:
#   > ckey P[RR],SB
#
# When a label is passed that doesn't match any of the criteria an exception is thrown
#
# Grammar:
#
#   expr        = selector {' ' selector} {default} EOF
#   selector    = attribs {'=' colorkey}
#   default     = 'default' '=' colorkey
#   attribs     = attrib {',' attrib}
#   attrib      = attrname {'[' attrvallist ']'}
#   attrname    = 'P' | 'CH' | 'SB' | 'FQ' | 'BL' | 'TIME' | 'SRC' |
#                 'p' | 'ch' | 'sb' | 'fq' | 'bl' | 'time' | 'src'
#   attrvallist = attrval {',' attrvallist}
#   attrval     = number | string | regex | 'None'
#   number      = [0-9]+
#   string      = ''' text '''
#   colorkey    = number
#   regex       = '/' text '/'
#   text        = all characters except the termination (http://stackoverflow.com/a/5455705/26083)
#
# The regex from http://stackoverflow.com/a/5455705/26083:
# (this one looks for single-quote quoted strings but can easily be modified to support
#  other delimiting characters)
#   re.compile( r"""(?<!\\)(?:\\\\)*'([^'\\]*(?:\\.[^'\\]*)*)'""", re.DOTALL )

def mk_escaped_rx(ch, suf=None):
    return re.compile( r"""(?<!\\)(?:\\\\)*{0}([^{0}\\]*(?:\\.[^{0}\\]*)*){0}{1}""".format(ch, "" if suf is None else suf), re.DOTALL )

def mk_regex(rx):
    flagmap = {'i': re.I}
    flag    = 0
    # strip flag characters
    while rx[-1]!=rx[0]:
        flag |= flagmap.get(rx[-1], 0)
        rx    = rx[:-1]
    return re.compile(rx[1:-1], flag)
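
# Illustrative behaviour of the two helpers above:
#
#   mk_regex("/wb.*/i")  => re.compile("wb.*", re.I)   - trailing flag characters are stripped
#   mk_escaped_rx("'")   matches a single-quoted string while allowing escaped
#                        quotes inside, per the stackoverflow recipe quoted above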
def parse_ckey_expr(expr):
    # our tokens
    tokens = [
        # we have two keywords: 'None' and 'default'
        (re.compile(r"\bnone\b", re.I), simple_t('None')),
        (re.compile(r"\bdefault\b", re.I), simple_t('default')),
        # the attribute names we support
        (re.compile(r"\b(p|ch|sb|fq|bl|time|src)\b", re.I), xform_t('attrname', str.upper)),
        # attribute values
        # @regex and text: we get the terminating start, end characters as well so must strip them off
        (mk_escaped_rx('/', "i?"), xform_t('regex', mk_regex)),
        number_token(),
        # '=', '[', ']' and ','
        token_def(r"\[", simple_t('lbracket')),
        token_def(r"\]", simple_t('rbracket')),
        token_def(r'=', simple_t('equal')),
        token_def(r',', simple_t('comma')),
        (mk_escaped_rx("'"), xform_t('text', lambda v: v[1:-1])),
        token_def(r"[^][ '\t,=]+", value_t('text')),
        token_def(r"\s+", ignore_t())
        #token_def(r"[a-zA-Z0-9\+\-]"
    ]

    # shorthands that work on the parser state 's'
    #next    = lambda s: s.next()
    tok     = lambda s: s.token
    tok_tp  = lambda s: s.token.type
    tok_val = lambda s: s.token.value

    # ---- implementation
    def parse_ckey_expr_impl(s):
        # the wrapper function that generates the color index for a given label
        def mk_ckey_fn(lst):
            def do_it(label, keycoldict, **opts):
                # run the label through all the filters and see if something sticks
                #print "parse_ckey_expr:ckey_fn label={0}".format( str(label) )
                cks = filter_(lambda x: x is not None, [ck(label, keycoldict) for ck in lst])
                # if no color for the label ... that's a bad thing isn't it?!
                if not cks:
                    raise RuntimeError("None of the colour filters matched label {0}".format( label ))
                #print "parse_ckey_expr:ckey_fn => found a match: colour = {0} [{1}]".format( cks[0], cks )
                return cks[0]
            return do_it

        selectors = []
        # a valid ckey expr is a sequence of valid selector assignments
        while True:
            selectors.append( parse_selector(s) )
            # After having successfully parsed a selector,
            # we should see EOF or another selector or default
            if tok(s).type in [None, 'default']:
                break
        # parse default if found
        if tok(s).type == 'default':
            selectors.append( parse_default(s) )
        # the only token left should be 'eof' AND, after consuming it,
        # the stream should be empty. Anything else is a syntax error
        try:
            if tok(s).type is None:
                next(s)
        except StopIteration:
            return mk_ckey_fn(selectors)
        raise SyntaxError("Tokens left after parsing %s" % tok(s))

    def parse_selector(s):
        def mk_cond(attrnm, attrlist, vallist):
            # list of values, turn into one string
            valstr = ",".join(map(str, vallist)) if vallist else ""
            def do_it(label):
                aval = getattr(label, attrnm)
                #print "parse_selector:do_it({0}) - attrnm[{1}] => {2}".format(str(label), attrnm, aval)
                # 'attrlist' is a list of functions that we'll pass the attribute value to, to see if it
                # matches the condition(s). Return true if at least one matches
                #if (aval is None) or (attrlist and [pred(aval) for pred in attrlist].count(True)==0):
                if attrlist and [pred(aval) for pred in attrlist].count(True)==0:
                    return (None, None)
                if vallist:
                    return (attrnm, valstr)
                if aval is None:
                    # The current attribute is stripped and there was no explicit match for it so
                    # there's not much we can do; maybe there's a default at the end but we don't know that here
                    return (None, None)
                    #raise RuntimeError("Your selector {0} seems based on a stripped attribute (==None)".format(attrnm))
                return (attrnm, aval)
            return do_it
        # we have any number of "attrname {'[' ... ']'}" before the '='
        condlist = []
        while True:
            # a selector starts with an attrname
            attrname = tok(s)
            if attrname.type!='attrname':
                raise SyntaxError("Unexpected token '{0}', expected attribute name".format(attrname))
            # safe to consume
            next(s)
            # if we see a '[' we must parse an attrvallist
            t     = tok(s)
            alist = []
            vlist = []
            if t.type=='lbracket':
                (alist, vlist) = parse_attrvallist(s)
            # excellent! add another condition to the list of conditions
            condlist.append( mk_cond(attrname.value.upper(), alist, vlist) )
            # Valid: either ',' (another attrib), '=', 'EOF' or another
            # selector - "p sb" (two selectors) is different from "p,sb" (one selector)
            # note: could also be end-of-input
            t = tok(s)
            if t.type not in ['equal', 'comma', 'attrname', 'default', None]:
                raise SyntaxError("Unexpected token '{0}', expected '=', ',', 'default' or an attribute name".format( t ))
            # Ok, let's see what to do now
            if t.type=='comma':
                # consume the comma and continue: we should see another attrname!
                next(s)
                continue
            # all other valid tokens cause a break
            break
        # Ok, we may see an '=' sign, another attributename, default, or EOF
        equal = tok(s)
        def mk_colidxfn():
            def do_it(keycoldict, key):
                # the default colour key function: if key not in keycoldict yet,
                # insert new one with new colour
                ck = keycoldict.get(key, None)
                if ck is None:
                    # find values in the keycoldict and choose one that isn't there already
                    colours = sorted([v for (k, v) in iteritems(keycoldict)])
                    # find first unused colour index, skip colour 0 and 1 (black & white)
                    ck = 2
                    while colours and ck<=colours[-1]:
                        if ck not in colours:
                            break
                        ck = ck + 1
                    keycoldict[key] = ck
                #print "parse_selector:colidxfn[default]: key={0} => colour={1}".format(key, ck)
                return ck
            return do_it
        colidxfn = mk_colidxfn()
        #colidxfn = lambda keycoldict, key: keycoldict.setdefault(key, len(keycoldict))
        if equal.type=='equal':
            # must see an integer
            next(s)
            cval = tok(s)
            if cval.type!='number':
                raise SyntaxError("Unexpected token '{0}', expected a colour index (integer)".format(cval))
            def mk_colidxfn(cv):
                def do_it(keycoldict, key):
                    #print "parse_selector:colidxfn[constval]=",cv
                    if keycoldict.setdefault(key, cv)!=cv:
                        raise RuntimeError("Key {0} was already present with other colour index {1} [request to set to {2}]".format(key, keycoldict.get(key), cv))
                    return cv
                return do_it
                #return lambda keycoldict, key: cv
            colidxfn = mk_colidxfn(cval.value)
            # ok, eat the integer
            next(s)
        # from the conditionlist and colorvalue, we may construct the
        # colorselection function
        def mk_colselect(conds, colouridxfn):
            # return a function that, given a label and a colorkey dict,
            # returns either None or the color index for the label
            def do_it(l, keycoldict):
                #print "parse_selector:colorselector[{0} conds] for label {1}".format(len(conds), str(l))
                # run all our conditions on the label; they ALL must match
                ms = map_(ylppa(l), conds)
                # if any of them are (None, None) - this label doesn't meet
                # all our criteria
                #print "parse_selector:colorselector[{0} conds] ms={1}".format(len(conds), ms)
                if (None, None) in ms:
                    return None
                # create new label with all attributes
                # taken from the conditions
                nl = plotutil.label( dict(ms), map(GetN(0), ms) )
                # str representation is key
                return colouridxfn(keycoldict, str(nl))
            return do_it
        return mk_colselect(condlist, colidxfn)

    # assert that we see 'default' '=' <colorkey>
    # and return a function that yields that <colorkey>
    def parse_default(s):
        # check current token for equivalence to 'default' and consume it
        cur = tok(s)
        if cur.type != 'default':
            raise SyntaxError("Unexpected token '{0}', expect 'default'".format(cur))
        next(s)
        # now we must see '='
        cur = tok(s)
        if cur.type != 'equal':
            raise SyntaxError("Unexpected token '{0}', expect '='".format(cur))
        next(s)
        # finally, we need to see a number; the default color
        cur = tok(s)
        if cur.type != 'number':
            raise SyntaxError("Unexpected token '{0}', expect a number!".format(cur))
        ival = copy.deepcopy( cur.value )
        # and eat the token
        next( s )
        def do_it(l, keycoldict):
            return ival
        return do_it
    # return a tuple of list of match functions and a list of values (the strings)
    # for later display purposes
    def parse_attrvallist(s):
        # we should see '['
        lbrack = tok(s)
        if lbrack.type!='lbracket':
            raise SyntaxError("Unexpected token '{0}', expect '['".format(lbrack))
        # eat the '['
        next(s)
        valfnlist  = []
        valvallist = []
        # now we should be eating comma-separated entries until we see ']'
        musthaveitem = False
        while True:
            cur = tok(s)
            # only accept the closing ']' if we do not expect another item
            if cur.type=='rbracket':
                if musthaveitem:
                    raise SyntaxError("Missing item after ',' in list at {0}".format(cur.position-1))
                # consume the ']'
                next(s)
                break
            # not close of list so must see at least a supported item
            if cur.type == 'number':
                # equality-compare
                def mk_comp(v):
                    def do_it(aval):
                        #print "parse_attrvallist:attribute-value comparator: {0}=={1}?".format(aval, v)
                        return aval==v if aval is not None else False
                    return do_it
                valfnlist.append( mk_comp(cur.value) )
            elif cur.type == 'text':
                # case insensitive equality-compare
                def mk_comp_i(v):
                    def do_it(aval):
                        #print "parse_attrvallist:attribute-value case insensitive text compare: {0}=={1}?".format(aval, v)
                        return aval.lower()==v if aval is not None else False
                    return do_it
                valfnlist.append( mk_comp_i(cur.value.lower()) )
            elif cur.type == 'regex':
                # run the regex against the attribute value
                def mk_reg_exec(rg):
                    def do_it(aval):
                        #print "parse_attrvallist:attribute-value regex matcher: {0} matches {1}?".format(aval, rg.pattern)
                        return rg.search(aval) is not None if aval is not None else False
                    return do_it
                valfnlist.append( mk_reg_exec(cur.value) )
            elif cur.type == 'None':
                # the attribute value is None
                valfnlist.append( lambda aval: aval is None )
            else:
                # unsupported type!
                raise SyntaxError("Unsupported list item '{0}', expected number, text or regex".format(cur))
            # append the actual value to the list
            valvallist.append( cur.value )
            # eat the item
            next(s)
            cur = tok(s)
            if cur.type=='comma':
                # eat the comma and indicate that we NEED an extra item
                musthaveitem = True
                next(s)
            else:
                musthaveitem = False
        return (valfnlist, valvallist)

    ## debugger - just show all the tokens
    def parse_ckey_expr_impl_tokens(s):
        while True:
            t = tok(s)
            print(t)
            if t.type is None:
                break
            next(s)
        return 42

    class state_type:
        def __init__(self, tokstream):
            self.tokenstream = tokstream
            self.depth       = 0
            next(self)

        def __next__(self):
            self.token = next(self.tokenstream)
            return self
        next = __next__

        def __str__(self):
            return "<{0}/{1}>".format(self.depth, self.token)

    tokenizer = mk_tokenizer(tokens, **{})
    return parse_ckey_expr_impl(state_type(tokenizer(expr)))
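# Hedged usage sketch for the colour-key parser above; the enclosing entry point
# and the exact token definitions live elsewhere in this file, and '3c84' is a
# made-up source name:
#
#     ckey_f = parse_ckey_expr("p,sb")                   # colour by unique (P,SB) combination
#     ckey_f = parse_ckey_expr("src[3c84]=2 default=1")  # source 3c84 -> colour 2, the rest -> 1
#
# The returned function maps a plot label plus a colour-key dict onto a colour
# index, allocating new indices with the selector machinery parsed above.

#########################################################################################################
#
# filter parser
#
# Let the user filter datasets to plot based on dataset attribute value(s)
# *after* they've been read from disk but before they're plotted.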
# # This can be useful if data reading takes a lot of time and then be able # to (re)plot subset(s) of that data without having to re-read the raw data # from disk # ######################################################################################################### ###### Our grammar # filter = condition eof # condition = condexpr {relop condition} | 'not' condition | '(' condition ')' # #condexpr = attribute '~' (regex|text) | attribute compare expr | attribute 'in' list # condexpr = attribute '~' (regex|text) | attribute compare number | attribute 'in' list # attribute = 'P' | 'CH' | 'SB' | 'FQ' | 'BL' | 'TIME' | 'SRC' | # 'p' | 'ch' | 'sb' | 'fq' | 'bl' | 'time' | 'src' # # compare = '=' | '>' | '>=' | '<' | '<=' ; # relop = 'and' | 'or' ; # list = '[' [value {',' value}] ']' # value = number | text # regex = '/' {anychar - '/'} '/' ['i'] ('i' is the case-insensitive match flag) # number = digit { number } # digit = [0-9] # text = char { alpha } # char = [a-zA-Z_] # alpha = char | digit { alpha } # mk_attribute_getter = lambda a: lambda obj: getattr(obj, a.upper()) def parse_filter_expr(qry, **kwargs): # Helper functions def mk_intrange(txt): return hvutil.expand_string_range(txt, rchar='-') # take a string and make a "^...$" regex out of it, # doing escaping of regex special chars and # transforming "*" into ".*" and "?" into "." # (basically shell regex => normal regex) def pattern2regex(s): s = reduce(lambda acc, x: re.sub(x, x, acc), [r"\+", r"\-", r"\."], s) s = reduce(lambda acc, t_r: re.sub(t_r[0], t_r[1], acc), [(r"\*+", r".*"), (r"\?", r".")], s) return re.compile(r"^"+s+"$") def regex2regex(s): flagmap = {"i": re.I, None:0} mo = re.match(r"(.)(?P<pattern>.+)\1(?P<flag>.)?", s) if not mo: raise RuntimeError("'{0}' does not match the regex pattern /.../i?".format(s)) return re.compile(mo.group('pattern'), flagmap[mo.group('flag')]) # basic lexical elements # These are the tokens for the tokenizer tokens = [ # the attribute names we support #(re.compile(r"\b(p|ch|sb|fq|bl|time|src)\b", re.I), value_t('attribute')), (re.compile(r"\b(p|ch|sb|fq|bl|time|src)\b", re.I), xform_t('attribute', mk_attribute_getter)), # operators token_def(r"\bnot\b", operator_t('not')), token_def(r"\bin\b", operator_t('in')), token_def(r"\b(and|or)\b", operator_t('relop')), token_def(r"(<=|>=|=|<|>)", operator_t('compare')), token_def(r"(~|\blike\b)", xform_t('regexmatch', lambda o, **k: lambda x, y: re.match(y, x) is not None)), # parens + list stuff token_def(r"\(", simple_t('lparen')), token_def(r"\)", simple_t('rparen')), token_def(r"\[", simple_t('lbracket')), token_def(r"\]", simple_t('rbracket')), token_def(r",", simple_t('comma')), # values + regex int_token(), token_def(r"/[^/]+/i?\b", xform_t('regex', regex2regex)), token_def(r"[:@\#%!\.a-zA-Z0-9_?|]+", value_t('text')), # and whitespace token_def(r"\s+", ignore_t()) ] tokenizer = mk_tokenizer(tokens, **kwargs) # The output of the parsing is a filter function that returns # True or False given a dataset object #next = lambda s: s.next() tok = lambda s: s.token tok_tp = lambda s: s.token.type tok_val = lambda s: s.token.value ###### Our grammar # filter = condition eof def parse_filter(s): if tok(s).type is None: raise SyntaxError("empty filter") filter_f = parse_condition(s) # "LIMIT" #limit = tok(s) #if limit.type=='limit': # # we MUST be followed by an int # next(s) # ival = tok(s) # if ival.type!='int': ## raise SyntaxError("Only an integer is allowed after limit, not %s" % ival) # # consume the integer # next(s) # count = 
itertools.count() # limit.value = lambda x: itertools.takewhile(lambda obj: count.next()<ival.value, x) #else: # limit.value = lambda x: x # the only token left should be 'eof' AND, after consuming it, # the stream should be empty. Anything else is a syntax error try: if tok(s).type is None: next(s) except StopIteration: return filter_f raise SyntaxError("Tokens left after parsing %s" % tok(s)) def parse_paren(s): lparen = tok(s) if lparen.type!='lparen': raise RuntimeError("Entered parse_paren w/o left paren but %s" % lparen) depth = s.depth s.depth = s.depth + 1 # recurse into parsing the expression - and do NOT forget to consume the lparen! expr = parse_expr(next(s)) # now we should be back at the same depth AND we should be seeing rparen rparen = tok(s) if rparen.type=='rparen': s.depth = s.depth - 1 next(s) return expr # condition = condexpr {relop condition} | 'not' condition | '(' condition ')' # relop = 'and' | 'or' ; def parse_condition(s): token = tok(s) # Recurse if we need to if token.type in ['lparen', 'rparen']: lterm = parse_paren_condition(s) # 'not' expr elif token.type=='not': # parse the next expr and negate it # we MUST have a next one condition = parse_condition(next(s)) if condition is None: raise SyntaxError("Missing expression after 'not' %s" % condition) lterm = lambda scan: operator.not_( condition(scan) ) else: # it must be a condexpr lterm = parse_cond_expr(s) # If we now see a relop, we have to parse another condition relop = tok(s) if relop.type!='relop': return lterm # consume the relop & parse the condition rterm = parse_condition(next(s)) if lterm is None: raise SyntaxError("Missing left-hand-condition to relational operator (%s)", relop) if rterm is None: raise SyntaxError("Missing right-hand-condition to relational operator (%s)", relop) # and return the combined operation return lambda scan: relop.value(lterm(scan), rterm(scan)) # condexpr = attribute '~' (regex|text) | attribute compare number | attribute 'in' list # compare = '=' | '>' | '>=' | '<' | '<=' ; def parse_cond_expr(s): attribute = tok(s) # No matter what, we have a left and a right hand side # separated by an operator if not (attribute.type == 'attribute'): raise SyntaxError("Unexpected token {0}, expected attribute name".format( attribute.type )) # consume the attribute value next(s) # Now we must see a comparator compare = tok(s) if not compare.type in ['compare', 'regexmatch', 'in']: raise SyntaxError("Expected a comparison operator, regex match or 'in' keyword, got {0}".format( compare )) # consume the comparison next(s) # do some processing based on the type of operator if compare.type=='in': rterm = parse_list(s) elif compare.type=='compare': #rterm = parse_expr(s) # we only support numbers here rterm = tok(s) if not (rterm.type=='int'): raise SyntaxError("Unexpected token {0}, expected a number here".format( rterm )) # and consume the number rterm = rterm.value next(s) else: # must've been regexmatch rterm = parse_rx(s) # it better exist if rterm is None: raise SyntaxError("Failed to parse right-hand-term of cond_expr (%s)" % tok(s)) print("HAVE rterm=",rterm) return lambda ds: compare.value(attribute.value(ds), rterm) def parse_paren_condition(s): lparen = tok(s) if lparen.type!='lparen': raise RuntimeError("Entered parse_paren_condition w/o left paren but %s" % lparen) depth = s.depth s.depth = s.depth + 1 # recurse into parsing the expression - and do NOT forget to consume the lparen! 
expr = parse_condition(next(s)) # now we should be back at the same depth AND we should be seeing rparen rparen = tok(s) if rparen.type=='rparen': s.depth = s.depth - 1 next(s) return expr def parse_rx(s): # we accept string, literal and regex and return an rx object rx = tok(s) if not rx.type in ['regex', 'text', 'literal']: raise SyntaxError("Failed to parse string matching regex (not regex, text or literal but %s)" % rx) # consume the token next(s) if rx.type=='literal': # extract the pattern from the literal (ie strip the leading/trailing "'" characters) rx.value = rx.value[1:-1] if rx.type in ['text', 'literal']: rx.value = pattern2regex(rx.value) return rx.value def parse_list(s): bracket = tok(s) if bracket.type != 'lbracket': raise SyntaxError("Expected list-open bracket ('[') but found %s" % bracket) rv = [] # keep eating text + ',' until we read 'rbracket' next(s) while tok(s).type!='rbracket': # if we end up here we KNOW we have a non-empty list because # the next token after '[' was NOT ']' # Thus if we need a comma, we could also be seeing ']' needcomma = len(rv)>0 #print " ... needcomma=",needcomma," current token=",tok(s) if needcomma: if tok(s).type=='rbracket': continue if tok(s).type!='comma': raise SyntaxError("Badly formed list at {0}".format(tok(s))) # and eat the comma next(s) # now we need a value. 'identifier' is also an acceptable blob of text rv.extend( parse_list_item(s) ) #print "parse_list: ",rv # and consume the rbracket (if not rbracket a syntax error is raised above) next(s) return rv # always returns a list-of-items; suppose the list item was an irange def parse_list_item(s): t = tok(s) # current token must be 'text' or 'irange' if not t.type in ['text', 'irange', 'int', 'float', 'literal']: raise SyntaxError("Failure to parse list-item {0}".format(t)) next(s) # for a literal, strip the leading and closing single quote if t.type == 'literal': t.value = t.value[1:-1] return t.value if t.type == 'irange' else [t.value] class state_type: def __init__(self, tokstream): self.tokenstream = tokstream self.depth = 0 next(self) def __next__(self): self.token = next(self.tokenstream) return self next = __next__ tokenizer = mk_tokenizer(tokens, **kwargs) return parse_filter(state_type(tokenizer(qry))) ######################################################################################################### # # animation parser # # Let the user animate datasets based on dataset attribute value(s) # *after* they've been read from disk # ######################################################################################################### ###### Our grammar # animate <selection> by <attributes> [with <options>] <eof> # (The 'animate' keyword is taken to be matched out in the command parser) # # # empty selection means "current" # <selection> = "" | <dataset> # <attributes> = <attribute> { ',' <attributes> } # # <dataset> = {<identifier> ':'} <expression> # <identifier> = [a-zA-Z][0-9a-zA-Z]* # alphanumeric variable name # # <attribute> = <attrname> { <sortorder> } # <attrname> = 'time' | 'src' | 'bl' | 'p' | 'sb' | 'ch' | 'type' # <sortorder> = 'asc' | 'desc' # # <options> = <option> { ',', <options> } # <option> = 'fps' '=' <float> # # <expression> = <expr> { 'and' <expression> | 'or' <expression> } # <expr> = <condition> | 'not' <expression> | '(' <expression> ')' # <condition> = <attrname> <relop> <value> | # <attrname> 'in' <list> | # <attrname> 'like' <regex> | # <attrname> 'like' <text> # <relop> = '<' | '<=' | '=' | '>' | '>=' # <list> = '[' <listitems> ']' | 
<intrange> # <listitems> = <listitem> {',' <listitems> } # <listitem> = <value> | <intrange> # <intrange> = <int>':'<int> # <value> = <number> | <text> # <text> = ''' <characters> ''' # <regex> = '/' <text> '/' # <number> = [0-9]+ # <float> = (see FLOAT above) # condition = condexpr {relop condition} | 'not' condition | '(' condition ')' # #condexpr = attribute '~' (regex|text) | attribute compare expr | attribute 'in' list # condexpr = attribute '~' (regex|text) | attribute compare number | attribute 'in' list # attribute = 'P' | 'CH' | 'SB' | 'FQ' | 'BL' | 'TIME' | 'SRC' | # 'p' | 'ch' | 'sb' | 'fq' | 'bl' | 'time' | 'src' # # compare = '=' | '>' | '>=' | '<' | '<=' ; # relop = 'and' | 'or' ; # list = '[' [value {',' value}] ']' # value = number | text # regex = '/' {anychar - '/'} '/' ['i'] ('i' is the case-insensitive match flag) # number = digit { number } # digit = [0-9] # text = char { alpha } # char = [a-zA-Z_] # alpha = char | digit { alpha } # def parse_animate_expr(qry, **kwargs): # Helper functions def mk_intrange(txt): return hvutil.expand_string_range(txt, rchar='-') # take a string and make a "^...$" regex out of it, # doing escaping of regex special chars and # transforming "*" into ".*" and "?" into "." # (basically shell regex => normal regex) def pattern2regex(s): s = reduce(lambda acc, x: re.sub(x, x, acc), [r"\+", r"\-", r"\."], s) s = reduce(lambda acc, t_r: re.sub(t_r[0], t_r[1], acc), [(r"\*+", r".*"), (r"\?", r".")], s) return re.compile(r"^"+s+"$") def regex2regex(s): flagmap = {"i": re.I, None:0} mo = re.match(r"(.)(?P<pattern>.+)\1(?P<flag>.)?", s) if not mo: raise RuntimeError("'{0}' does not match the regex pattern /.../i?".format(s)) return re.compile(mo.group('pattern'), flagmap[mo.group('flag')]) # basic lexical elements # These are the tokens for the tokenizer tokens = [ token_def(r"\b(animate|by|asc|desc|with)\b", keyword_t()), # the attribute names we support #(re.compile(r"\b(p|ch|sb|fq|bl|time|src|type)\b", re.I), xform_t('attribute', mk_attribute_getter)), (re.compile(r"\b(p|ch|sb|fq|bl|time|src|type)\b", re.I), xform_t('attribute', str.upper)), #(re.compile(r"\b(p|ch|sb|fq|bl|time|src|type)\b", re.I), value_t('attribute')), # Date + time formats datetime_token(YMD, TIME), datetime_token(YMD, HMS), datetime_token(DMY, TIME), datetime_token(DMY, HMS), datetime_token(DMY_EUR, TIME), datetime_token(DMY_EUR, HMS), # Relative day offset - note: assume that the global variable # 'start' is set correctly ... 
token_def(RELDAY+TIME, datetime_t(mk_seconds)), token_def(RELDAY+HMS, datetime_t(mk_seconds)), # Time durations token_def(RELDAY+DUR3, datetime_t(mk_seconds)), token_def(RELDAY+DUR2, datetime_t(mk_seconds)), token_def(RELDAY+DUR1, datetime_t(mk_seconds)), token_def(DUR4, xformmg_t('duration', mk_seconds)), token_def(DUR3, xformmg_t('duration', mk_seconds)), token_def(DUR2, xformmg_t('duration', mk_seconds)), token_def(DUR1, xformmg_t('duration', mk_seconds)), # operators token_def(r"\bnot\b", operator_t('not')), token_def(r"\bin\b", operator_t('in')), token_def(r"\b(and|or)\b", operator_t('relop')), token_def(r"(<=|>=|=|<|>)", operator_t('compare')), token_def(r"(~|\blike\b)", xform_t('regexmatch', lambda o, **k: lambda x, y: re.match(y, x) is not None)), # parens + list stuff token_def(r"\(", simple_t('lparen')), token_def(r"\)", simple_t('rparen')), token_def(r"\[", simple_t('lbracket')), token_def(r"\]", simple_t('rbracket')), token_def(r",", simple_t('comma')), token_def(r":", simple_t('colon')), token_def(r"-|\+|\*|/", operator_t('operator')), # values + regex float_token(), int_token(), token_def(r"/[^/]+/i?\b", xform_t('regex', regex2regex)), token_def(r"\$(?P<sym>[a-zA-Z][a-zA-Z_]*)", resolve_t('external', 'sym')), token_def(r"'([^']*)'", extract_t('literal', 1)), token_def(r"[a-zA-Z][a-zA-Z0-9_]*", value_t('identifier')), token_def(r"\S+", value_t('text')), #token_def(r"[:@\#%!\.\*\+\-a-zA-Z0-9_?|]+", value_t('text')), # and whitespace token_def(r"\s+", ignore_t()) ] tokenizer = mk_tokenizer(tokens, **kwargs) # The output of the parsing is a filter function that returns # True or False given a dataset object #next = lambda s: s.next() tok = lambda s: s.token tok_tp = lambda s: s.token.type tok_val = lambda s: s.token.value ###### Our grammar # animate = 'animate' [<selection>] 'by' <attributes> <eof> def parse_animate(s): if tok(s).type != 'animate': raise SyntaxError("The animate expression does not start with the keyword 'animate' but with {0}".format(tok(s))) # skip that one next(s) # now we may see a selection selection_f = parse_selection(s) # check mismatched parentheses in the expression(s) if s.depth!=0: raise SyntaxError("Mismatched parentheses") # now we MUST see the 'by' keyword if tok(s).type!='by': raise SyntaxError("Unexpected token {0}, expected the 'by' keyword".format(tok(s))) # and skip that one next(s) # now we must parse the <attributes> groupby_f = parse_attributes(s) # now we may see 'with' followed by settings options = parse_options(s) # the only token left should be 'eof' AND, after consuming it, # the stream should be empty. Anything else is a syntax error try: if tok(s).type is None: next(s) except StopIteration: return (selection_f, groupby_f, options) raise SyntaxError("(at least one)token left after parsing: {0}".format(tok(s))) # <selection> = "" | <dataset> def parse_selection(s): # try to parse the dataset identifier, if it is None then there was none # and we default to "_" dataset_id = parse_dataset_id(s) #if dataset_id is None: # dataset_id = '_' # now we may see an expression filter_f = parse_expression(s) # build a filtering function on the indicated dataset return (dataset_id, filter_f) # <dataset> = {<identifier> ':'} <expression> # Note: let's allow 'attribute' here as well - the ':' following # the dataset_id would be the disambiguator between # data set identifier and attribute name? 
def parse_dataset_id(s): # if we don't see an identifier that means there's no identifier at all dataset_id = tok(s) if dataset_id.type not in[ 'identifier', 'attribute']: return None # if we see attribute we need to peek ahead to see if the disambiguating ':' is there if dataset_id.type == 'attribute' and s.peek().type!='colon': # rite, not a data set id return None dataset_id = dataset_id.value # eat it up and then we MUST see ':' next(s) if tok(s).type != 'colon': raise SyntaxError("Expected ':' following data set identifier") # consume and return the actual identifier next(s) return dataset_id # <expression> = <expr> { 'and' <expression> | 'or' <expression> } # <expr> = <condition> | 'not' <expression> | '(' <expression> ')' # <condition> = <attrname> <relop> <number> | def parse_expression(s): left = parse_expr(s) if left is None: return None # OK. We have a left hand side # if we're looking at a relop we may have to parse a right hand side if tok(s).type != 'relop': return left # ok looking at relop, save it for later use and move on relop = tok(s).value # now we MUST see a right hand side right = parse_expression( next(s) ) if right is None: raise SyntaxError("Missing right hand side to logical operator and or or") def mk_f(l, op, r): def do_it(ds): return op(l(ds), r(ds)) return do_it return mk_f(left, relop, right) def parse_expr(s): # depending on what token we're looking at choose the appropriate action tp = tok(s).type if tp == 'attribute': # depending on the type of the attribute ... return parse_cond_expr(s) #return parse_cond_expr(s) if tp == 'not': # consume the 'not' and return whatever is following f = parse_expression( next(s) ) return lambda ds: not f(ds) if tp == 'lparen': # remove the '(' s.depth = s.depth + 1 expr = parse_expression( next(s) ) # now we MUST see ')' [and if we do skip it] if tok(s).type != 'rparen': raise SyntaxError("Mismatched parenthesis") s.depth = s.depth - 1 next(s) if expr is None: raise SyntaxError("An empty expression is not an expression") return expr return None # term = duration | number | external def parse_time_term(s): term = tok(s) # The easy bits first if term.type in ['number', 'external', 'duration', 'datetime']: # all's well - eat this term next(s) return term.value return None # support datetime or an expression involving datetime? # expr = term | expr '+' expr | expr '-' expr | expr '*' expr | expr '/' expr | '(' expr ')' | '-' expr def parse_time_cond(s, unary=False): t = tok(s) depth = s.depth print("parse_time_cond/tok=", t, " depth=", depth) if t.type == 'lparen': s.depth = s.depth + 1 lterm = parse_time_cond( next(s) ) # now we MUST see ')' [and if we do skip it] if tok(s).type != 'rparen': raise SyntaxError("Mismatched parenthesis") s.depth = s.depth - 1 next(s) if lterm is None: raise SyntaxError("An empty expression is not an expression") return lterm elif t.type=='operator' and t.value is mk_operator('-'): # unary '-' tmpexpr = parse_time_cond(next(s), unary=True) lterm = operator.neg( tmpexpr ) else: print("parsing time term?") lterm = parse_time_term(s) print(" yields: ",lterm) # If we see an operator, we must parse the right-hand-side # (our argument is the left-hand-side # Well ... not if we're doing unary parsing! 
        # if we saw unary '-' then we should parse parens and terms up until
        # the next operator
        oper = tok(s)
        if oper.type=='operator':
            if unary:
                return lterm
            if lterm is None:
                raise SyntaxError("No left-hand-side to operator {0}".format(oper))
            rterm = parse_time_cond(next(s))
            if rterm is None:
                raise SyntaxError("No right-hand-side to operator {0}".format(oper))
            return oper.value(lterm, rterm)
        elif oper.type in ['int', 'float', 'duration', 'datetime']:
            # negative numbers as right hand side are not negative numbers
            # but are operator '-'!
            # so, subtracting a number means adding the negative value (which we already
            # have got)
            # Consume the number and return the operator add
            next(s)
            return operator.add(lterm, oper.value)
        # neither parens, terms, operators?
        return lterm

    # condexpr = attribute '~' (regex|text) | attribute compare number | attribute 'in' list
    # compare  = '=' | '>' | '>=' | '<' | '<=' ;
    def parse_cond_expr(s):
        attribute = tok(s)
        # No matter what, we have a left and a right hand side
        # separated by an operator
        if not (attribute.type == 'attribute'):
            raise SyntaxError("Unexpected token {0}, expected attribute name".format( attribute.type ))
        # consume the attribute value
        next(s)
        # Now we must see a comparator
        # for the time attribute 'regexmatch' and 'in' don't make sense
        compare = tok(s)
        if not compare.type in (['compare'] if attribute.value == 'TIME' else ['compare', 'regexmatch', 'in']):
            raise SyntaxError("Invalid comparison operator {0} for attribute {1}".format( compare, attribute.value ))
        # consume the comparison
        next(s)
        # do some processing based on the type of operator
        if compare.type=='in':
            rterm = parse_list(s)
        elif compare.type=='compare':
            # we must compare to a value
            # take care of when attribute is 'time'
            if attribute.value == 'TIME':
                rterm = parse_time_cond(s)
            else:
                rterm = parse_value(s)
        else:
            # must've been regexmatch
            rterm = parse_rx(s)
        # it better exist
        if rterm is None:
            raise SyntaxError("Failed to parse right-hand-term of cond_expr (%s)" % tok(s))
        return lambda ds: compare.value(mk_attribute_getter(attribute.value)(ds), rterm)

    # <value> = <number> | <text>
    def parse_value(s):
        value = tok(s)
        if value.type not in ['int', 'text', 'identifier']:
            raise SyntaxError("Unsupported value type - only int or text allowed here, not {0}".format( value ))
        # consume the value and return it
        next(s)
        return value.value

    def parse_rx(s):
        # we accept string, literal, identifier and regex and return an rx object
        rx = tok(s)
        if not rx.type in ['regex', 'text', 'literal']:
            raise SyntaxError("Failed to parse string matching regex (not regex, text or literal but %s)" % rx)
        # consume the token
        next(s)
        if rx.type in ['text', 'literal']:
            rx.value = pattern2regex(rx.value)
        return rx.value

    # <list> = '[' <values> ']' | <intrange>
    def parse_list(s):
        # could be actual list or int range
        if tok(s).type == 'lbracket':
            return parse_list_list(s)
        elif tok(s).type == 'int':
            return parse_int_range(s)
        # unexpected token
        raise SyntaxError("Unexpected token {0} - not a list or int range".format(tok(s)))

    def parse_int_range(s):
        # we *must* be looking at 'int'
        start = tok(s)
        if start.type != 'int':
            raise SyntaxError("Expected an integer here, not a {0}".format(start))
        next(s)
        # now we must see colon
        if tok(s).type != 'colon':
            raise SyntaxError("Expected ':' to form integer range")
        # eat up
        next(s)
        end = tok(s)
        if end.type != 'int':
            raise SyntaxError("Expected an integer here, not a {0}".format(end))
        # don't forget to consume the number
        next(s)
        return range_(start.value, end.value+1)
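    # Hedged sketch of the integer-range rule above ('mydata' and the subband
    # range are made up): parse_int_range turns "0:3" into the inclusive range
    # 0,1,2,3 (note the end.value+1), so
    #
    #     animate mydata: sb in 0:3 by time
    #
    # would match subbands 0 through 3 inclusive.
    def parse_list_list(s):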
bracket = tok(s) if bracket.type != 'lbracket': raise SyntaxError("Expected list-open bracket ('[') but found %s" % bracket) rv = [] # keep eating text + ',' until we read 'rbracket' next(s) while tok(s).type!='rbracket': # if we end up here we KNOW we have a non-empty list because # the next token after '[' was NOT ']' # Thus if we need a comma, we could also be seeing ']' needcomma = len(rv)>0 #print " ... needcomma=",needcomma," current token=",tok(s) if needcomma: if tok(s).type=='rbracket': continue if tok(s).type!='comma': raise SyntaxError("Badly formed list at {0}".format(tok(s))) # and eat the comma next(s) # now we need a value. 'identifier' is also an acceptable blob of text rv.extend( parse_list_item(s) ) #print "parse_list: ",rv # and consume the rbracket (if not rbracket a syntax error is raised above) next(s) return rv # always returns a list-of-items; suppose the list item was an irange def parse_list_item(s): t = tok(s) # current token must be 'text' or 'irange' if not t.type in ['text', 'irange', 'int', 'float', 'literal']: raise SyntaxError("Failure to parse list-item {0}".format(t)) next(s) return t.value if t.type == 'irange' else [t.value] # <attributes> = <attribute> { ',' <attribute> } # <attribute> = <attrname> { <sortorder> } # <attrname> = 'time' | 'src' | 'bl' | 'p' | 'sb' | 'ch' | 'type' # <sortorder> = 'asc' | 'desc' def parse_attributes(s): groupby = set() sortfns = [] while True: item = tok(s) if item.type!='attribute': raise SyntaxError("Unexpected token {0}, expected an attribute".format(item)) # check that the same attribute does not get mentioned twice if item.value in groupby: raise RuntimeError("The attribute type {0} is mentioned more than once".format(item.value)) groupby.add( item.value ) # Peek at the next token. If it's asc/desc take that into account next(s) order = tok(s) if order.type in ['asc', 'desc']: order = order.type # consume it next(s) else: # default to asc order = 'asc' # create a sorting function def mk_sf(attr, order): def do_it(x): return sorted(x, key=operator.attrgetter(attr), reverse=(order=='desc')) return do_it sortfns.append( mk_sf(item.value, order) ) #if we don't see a comma next, we break if tok(s).type!='comma': break # consume the comma next(s) # primary sort key is now first in list but for the sorting to work in steps # (see https://wiki.python.org/moin/HowTo/Sorting ) we must apply the sorting # functions in reverse order return (operator.attrgetter(*groupby), lambda x: reduce(lambda acc, sortfn: sortfn(acc), reversed(sortfns), x)) # parse the options, if any def parse_options(s): options = type('',(),{})() # are we looking at the correct keyword, 'with'? 
        if tok(s).type!='with':
            return options
        # stay here to parse key=value as long as we find'em
        next(s)
        while True:
            option = parse_option(s)
            setattr(options, option[0], option[1])
            # if we don't see ',' we assume there's no more options
            # so break from the loop and let the next step check validity
            # of input
            next(s)
            if tok(s).type != 'comma':
                break
            # consume the ',' or the next parse_option() would choke on it
            next(s)
        return options

    option_type_map = {'fps': (float, ['int','float'])}
    def parse_option(s):
        # need to see an identifier
        key = tok(s)
        if key.type != 'identifier':
            raise SyntaxError("option specifier does not start with an identifier (found {0})".format(key))
        # next up '='
        next(s)
        equal = tok(s)
        if equal.type != 'compare' or equal.value != operator.eq:
            raise SyntaxError("Did not find '=' after option key but {0}".format(equal))
        # depending on recognized option type look for specific following token
        (convert, expect) = option_type_map.get( key.value.lower(), (lambda x: x, None) )
        next(s)
        got = tok(s)
        if expect is not None and got.type not in expect:
            raise SyntaxError("Expected value of type {0} for key {1}, got {2} instead".format(expect, key.value, got))
        # convert the parsed value
        return (key.value.lower(), convert(got.value))

    class state_type:
        def __init__(self, tokstream):
            self.tokenstream = tokstream
            self.depth       = 0
            self.inquote     = False
            self.lookAhead   = []
            next(self)

        def peek(self):
            self.lookAhead.append( next(self.tokenstream) )
            return self.lookAhead[-1]

        def __next__(self):
            if self.lookAhead:
                self.token = self.lookAhead.pop(0)
            else:
                self.token = next(self.tokenstream)
            return self
        next = __next__

    tokenizer = mk_tokenizer(tokens, **kwargs)
    return parse_animate(state_type(tokenizer(qry)))
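# Hedged usage sketch for the option parsing above; 'fps' is the only key in
# option_type_map and the data set name 'mydata' is made up:
#
#     (selection_f, groupby_f, options) = parse_animate_expr("animate mydata: sb in 0:3 by time with fps=2.5")
#     options.fps    # -> 2.5, converted to float
#
# Unrecognized option keys are accepted as-is; only keys listed in
# option_type_map get type-checked and converted.

#Need to be able to parse a baseline expression:
#   bl cross -ef* +ef(mc|ys)
# but also: digits for antenna number, numbers-as-strings
# (AIPS export + importfitsidi produces antenna *names*
#  which are string representations of numbers ...)
#
# input      = 'bl' {<selectors>} EOF
# selectors  = <selector> { <selectors> }
# selector   = {<add_or_remove>} <pattern>
# pattern    = 'auto' | 'cross' | <match>
# match      = <ant><ant>
# ant        = '*' | <expression>
# expression = <part> | '(' <part> ['|' <part>] ')'
# part       = [a-zA-Z0-9_%]+ | <quote> <part> <quote>
# quote      = '"' | '''
# The selection function is a function which, when passed the baselineMap
# returns a list of selected baselines and the accompanying TaQL
#
# We're going to take 'selection.baselineMap' ("blmap_") and transform
#     blmap_.baselineList
# which is [ ((x_ant, y_ant), "XAntYAnt"), ... ]
# into:
#     blmap_' = [ (x_ant, y_ant, "XAnt", "YAnt", "XAntYAnt", "YAntXAnt"), ...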
] # for easier matching on individual components # # In both forms: # x_ant, y_ant: antenna numbers # XAnt, YAnt : antenna name # The parsing results in an object with two properties: # * the selected baselines (strings) # * the accompanying TaQL class ParseResult: __slots__ = ['baselines', 'taql', 'nExpr'] def __init__(self): self.baselines = None #set() self.taql = '' self.nExpr = 0 __str__ = __repr__ = method("ParseResult[baselines={0.baselines}, taql='{0.taql}']".format) ####################################################################### # We support these binary operations: # * add to current selection # * remove from current selection ####################################################################### # transform subselector function f into a function # that adds or removes to the current selection and updates the TaQL expression # op = set.union | set.difference # taqlop = 'OR' | 'AND NOT' def mk_select_action(expression, f, op, taqlop): # pr_acc = ParseResult accumulator def do_it(pr_acc, blm): #print("DOING_IT!\n\tpr_acc(in)=",pr_acc) # first, run the selection function on the total set of # baselines to figure out which baselines to add (selected, taql) = f(blm) # the only valid selector that gives an empty set is the one # with taql == 'FALSE' (the result of parsing the 'none' token) if not selected and taql != 'FALSE': print("WARN: your expression '{0}' did not match anything".format(expression)) #print("\tselected=",selected,"\n\ttaql=",taql) pr_acc.baselines = op(set() if pr_acc.baselines is None else pr_acc.baselines, selected) # if by now all baselines are selected, there's really no point # in adding more to the TaQL, we might as well remove it if pr_acc.baselines >= set(blm): pr_acc.taql = '' return pr_acc # only modify taql if # 1) if this expression selected anything at all, and # 2) if any taql associated with the expression if taql: # only combine taql if we had a previous taql expr pr_acc.taql = pr_acc.taql + " {0} ({1})".format(taqlop, taql) if pr_acc.taql else taql return pr_acc return do_it # This method is appended as final "mk_select_action()" # and in case the selection is empty verifies the taql is 'FALSE' # If that is true, the taql is replaced with '' (no taql) such # that a "bl none" command effectively removes any bl selection taql def fixup_bl_none(pr_acc, blm): if not pr_acc.baselines: if pr_acc.taql != 'FALSE': raise RuntimeError("Your baseline selection yielded no matches!") pr_acc.taql = '' if pr_acc.taql: pr_acc.taql = '(' + pr_acc.taql + ')' return pr_acc setoperatormap = {'+': set.union, '-': set.difference} taqlmap = {'+': 'OR', '-': 'AND NOT'} operator2set = lambda op: setoperatormap.get(op, set.union) # cooked selectors cookmap = { # 'all' and 'none' are sort of synonyms but not quite # By giving 'all' a bit of TaQL this becomes valid: "bl all -wb" # "bl none +wb" would be valid and equal 'all': lambda i: (set(i), 'TRUE'), 'none': lambda i: (set(), 'FALSE'), 'auto': lambda i: (set(filter(lambda tup: tup[0]==tup[1], i)), "ANTENNA1==ANTENNA2"), 'cross': lambda i: (set(filter(lambda tup: tup[0]!=tup[1], i)), "ANTENNA1!=ANTENNA2") } cooked2fn = lambda cooked: mk_select_action(cooked, cookmap.get(cooked), setoperatormap.get('+'), taqlmap.get('+')) # anything not cooked should look like: # [+-]?<ant1><ant2> class p_state_type: def __init__(self, tokstream, **kwargs): # do this first for (a, v) in kwargs.items(): setattr(self, a, v) # now we set the attributes that # have special meaning to *us* # i.e. 
possibly overwriting what the user put in
        self.tokenstream = tokstream
        self.depth       = 0
        self.inquote     = False
        self.lookAhead   = []
        next(self)

    def peek(self):
        self.lookAhead.append( next(self.tokenstream) )
        return self.lookAhead[-1]

    def __next__(self):
        if self.lookAhead:
            self.token = self.lookAhead.pop(0)  #self.lookAhead[0] #self.lookAhead = None
        else:
            self.token = next(self.tokenstream)
        return self
    next = __next__

p_tok = lambda s: s.token

# Deal with extracting stuff from a list of (xant, yant, "XAnt", "YAnt", ...) tuples
get_ant_n = lambda n: compose(list, set, Map(GetN(n)))
get_ant_1 = get_ant_n(0)
get_ant_2 = get_ant_n(1)

field_xform_ = {0:identity, 1:identity}
field_xform  = lambda n: compose(field_xform_.get(n, str.lower), GetN(n))

# TaQL building blocks for ANTENNA1 and ANTENNA2, automatically
# using "ANTENNAx == number" or "ANTENNAx IN [list, of, numbers]" depending on how many conditions must be matched
def ant_taql(n, ants):
    return "ANTENNA{0}=={1}".format(n, ants[0]) if len(ants)==1 else "ANTENNA{0} IN {1}".format(n, ants)
ant1_taql = partial(ant_taql, 1)
ant2_taql = partial(ant_taql, 2)

# This matches either end of a baseline (e.g. only conditions given for one end of the baseline);
# you have to remember that the condition can apply to either ANTENNA1 or ANTENNA2
bl_taql1ant = combine("({0} OR {1})".format, ant1_taql, ant2_taql)
# Single condition for baseline with constraints on both ends
bl_taql2ant = lambda a1, a2: "({0} AND {1})".format(ant1_taql(a1), ant2_taql(a2))
# and now the full fledged baseline with constraints on both ends
# (which means that it could also be reversed)
bl_taql = lambda a1, a2: "(({0}) OR ({1}))".format(bl_taql2ant(a1, a2), bl_taql2ant(a2, a1))

# blmatch = (suggested_taql, list-of-match_fns)
# each entry match-fn in list-of-match_fns must accept a baseline tuple (xant, yant, "XAnt", "YAnt", "XAntYAnt", "YAntXAnt")
# and return True/False whether to include this baseline in the set of matched baselines
def mk_match(blmatch):
    # we must return the set of selected baselines
    # as well as generate the TaQL
    (taql, conds) = blmatch
    def do_it(blmap):
        # run all conditions on the baselines, generating
        # the total set of matches for this list-of-conditions
        to_add = reduce(set.union, map(lambda c: set(filter(c, blmap)), conds))
        # now generate the TaQL for it
        # if the expression seems to select all baselines we shortcircuit
        # and ignore the TaQL that the blmatch input suggested
        if len(to_add) == len(blmap):
            # this means all baselines selected ...
            return (blmap, '')
        return (to_add, taql)
    return do_it
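# Hedged sketch of the TaQL builders above (the antenna ids are made up):
#
#     ant1_taql([3])      -> "ANTENNA1==3"
#     ant2_taql([1, 2])   -> "ANTENNA2 IN [1, 2]"
#     bl_taql1ant([3])    -> "(ANTENNA1==3 OR ANTENNA2==3)"
#     bl_taql([1], [2])   -> "(((ANTENNA1==1 AND ANTENNA2==2)) OR ((ANTENNA1==2 AND ANTENNA2==1)))"
#
# i.e. one antenna id yields an equality test, more than one an IN test, and a
# full baseline constraint matches in either direction.

# mk_field_f_ takes a list of tuples
#    [ (set{field-ids}, match-fn, antid), (set{field-ids}, match-fn, antid), ...]
# where each tuple describes a match function and on which baseline tuple fields to
# run this function
#
# and turns it into a list of tuples:
#    [ (field-idx, [match-fn, match-fn, ...]), (field-idx, [match-fn, match-fn]), ... ]
# i.e.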
groups the match functions by individual field index such that after extracting # a specific field from a baseline tuple, all match functions can be run against it in one go xtract_f = Map_(GetN(1)) # tup = (field-idx, [(field-idx, match-fn), (field-idx, match-fn), ...]) def mk_match_fn(tup): get_f = field_xform(tup[0]) match_fns = xtract_f(tup[1]) def do_it(bl): return any(map(ylppa(get_f(bl)), match_fns)) return do_it mk_field_f_= compose(Map_(mk_match_fn), GroupBy(GetN(0)), Sorted(GetN(0)), set, Reduce(operator.__add__), Map(lambda t: [(i, t[1]) for i in t[0]])) # Given list of [ (set{field_ids}, match_fn, antid), (set{field_ids}, match_fn, antid), ... ] # return boolean True or False depending on there being a match_fn equal to None in there dont_care = compose(truth, Call('count', None), Map_(GetN(1))) # Given same list, return the set of antenna ids get_ant_ids= compose(list, set, Map(GetN(2))) # given conditions for ant1 and possibly ant2, generate filter function which # will, well, filter baselines matching both (if appropriate) def mk_field_f(ant1, ant2): (a1, a1f) = (None, None) if not ant1 or dont_care(ant1) else (get_ant_ids(ant1), mk_field_f_(ant1)) (a2, a2f) = (None, None) if not ant2 or dont_care(ant2) else (get_ant_ids(ant2), mk_field_f_(ant2)) if a1f is None and a2f is None: # match function is simple, no TaQL return ("", [const(True)]) if a2f is None: # there have to be conditions on ant1 # generate the TaQL based on ant1 conditions return (bl_taql1ant(a1), a1f) if a1f is None: # there have to be conditions on ant2 return (bl_taql1ant(a2), a2f) # bugger, need to find all combinations! # we need to come up with a list of functions def do_it(bl): m_bl = Map(ylppa(bl)) return any(m_bl(a1f)) and any(m_bl(a2f)) # The taql is ((antenna1 in a1 AND antenna2 in a2) OR (antenna1 in a2 and antenna2 in a1)) taql = "(({0}) OR ({1}))".format(bl_taql2ant(a1, a2), bl_taql2ant(a2, a1)) return (taql, [do_it]) # given dict of "XAnt" (name) -> xant (nr) entries, # transform into case insensitive tokens. The reverse length sort is to have the longer names # before the shorter ones to handle the case of shorter name being prefix of longer name ant_tokens = compose(Map_(lambda ant: (re.compile(r""+ant, re.I), value_t('antenna'))), Sorted(len, reverse=True), Call('keys')) class selector_parser: _tokens_ = [ token_def(r"-|\+|\*", keyword_t()), token_def(r"\(", simple_t('lparen')), token_def(r"\)", simple_t('rparen')), token_def(r"\|", simple_t('or')), token_def(r"'", simple_t('quote')), int_token(), token_def(r"\b(all|auto|cross|none)\b", keyword_t()), ] # parse the expression def __call__(self, expr, blmap, antmap): #print("START PARSING EXPR=", expr) # add the individual antenna names as tokens! # remember: blmap = [(xant, yant, "XAnt", "YAnt", "XAntYAnt", "YAntXAnt"), ....] # so extract all unique "XAnt"/"YAnt"s and transform those into tokens tokenizer = mk_tokenizer(selector_parser._tokens_ + ant_tokens(antmap)) state = p_state_type(tokenizer(expr), expression=expr) result = self.parse_expr(state, blmap, antmap) # the only token left should be 'eof' AND, after consuming it, # the stream should be empty. 
Anything else is a syntax error try: if p_tok(state).type is None: next(state) except StopIteration: #print("selector_parser/returning value:",result) return result raise SyntaxError("(at least one)token left after parsing selector: {0}".format(p_tok(state))) # entry point def parse_expr(self, s, blmap, antmap): # default action is to add to the current set action = '+' # if the user specified a specific action, save it and eat the token if p_tok(s).type in "+-": action = p_tok(s).type next(s) # we now expect ant1 selector # if we see a cooked entry, no ant2 possible if p_tok(s).type in ['all', 'none', 'auto', 'cross']: cooked = p_tok(s).type #print("Got cooked=", cooked) next(s) return mk_select_action(s.expression, cookmap.get(cooked), setoperatormap.get(action), taqlmap.get(action)) # Now we get to 'normal' ant1 selection ant1cond = self.parse_selector(s, blmap, antmap, True) if not ant1cond: raise SyntaxError("At least one antenna selection criterion required") ant2cond = self.parse_selector(s, blmap, antmap, False) #print("Got ant1cond=", ant1cond) #print(" ant2cond=", ant2cond) return mk_select_action(s.expression, mk_match(mk_field_f(ant1cond, ant2cond)), setoperatormap.get(action), taqlmap.get(action)) def parse_selector(self, s, blmap, antmap, require_item): options = list() # do we see lparen? if p_tok(s).type == 'lparen': s.depth = s.depth + 1 next(s) # now we start collecting selections, they may be quoted #require_item = False while True: item = self.parse_item(s, blmap, antmap) if item is None: if require_item: raise SyntaxError("Expected a baseline selection item") #break else: options.append( item ) # typically we only parse one item, unless # someone used parens tp = p_tok(s).type # by just breaking from the loop on ")" it can be either # correct or incorrect but that is easily checked outside the loop if tp == 'rparen': s.depth = s.depth - 1 next(s) break # If we don't see "|" we break (again, if we opened parens but # no closing one then that's caught outside the loop if tp != 'or': break # eat the "|" next(s) require_item = True # if s.depth != 0 we have unbalanced parentheses if s.depth != 0: raise SyntaxError("Unbalanced parentheses") return options # return tuple (set{}, match_f, antenna_id) # or None, in case not a valid token # set{} is the set of field numbers that match_f should operate on. # Note: field_numbers as in the named tuple BLMapEntry # (xant, yant, "XAnt", "YAnt", "XAntYAnt", "YAntXAnt") # 0 1 2 3 4 5 def parse_item(self, s, blmap, antmap): # we may see a quote now if p_tok(s).type == 'quote': s.inquote = not s.inquote next(s) # now we accept 'text', 'antenna', number or '*' # for the matching it is important to know if we're matching on # antenna number or on antenna name tp = p_tok(s).type rv = None if tp in ['text', 'antenna'] or s.inquote: # text match for all types on BLMapEntry fields #2 and #3 (XAnt, YAnt) aname = str(p_tok(s).value if tp in ['text', 'int', 'antenna'] else p_tok(s).type).lower() if aname not in antmap: print("WARNING: There seems to be no antenna by the name of ", aname) match_f = partial(eq, aname) rv = (set([2,3]), match_f, antmap.get(aname, -42)) next(s) else: # numeric if number, "*" without quotes is anything # Note: only consume the token if we've handled it if tp == 'int': # numerical match on field xant, yant anum = p_tok(s).value rv = (set([0, 1]), partial(eq, anum), anum) next(s) elif tp == '*': # matches anything! 
rv = (set(), None, -1) next(s) # only eat the quote if we're expecting one if s.inquote and p_tok(s).type == 'quote': s.inquote = not s.inquote next(s) # mismatched quotes? if s.inquote: raise SyntaxError("Unbalanced quotes") return rv ALLOWED = r"A-Za-z0-9\(\)#|@%_\*'" BLMapEntry = collections.namedtuple('BLMapEntry', ['xant', 'yant', 'XAnt', 'YAnt', 'BL', 'LB']) def parse_baseline_expr(qry, blmap, **kwargs): # basic lexical elements # These are the tokens for the tokenizer tokens = [ # the bl keyword token_def(r"\bbl\b", keyword_t()), # any sequence of non-whitespace, with optional leading +/- token_def(r"[-+]?\S+", value_t('selector')), # and whitespace token_def(r"\s+", ignore_t()) ] # Transform blmap -> blmap_ ANAME = blmap.antennaName def xform(bli): a1, a2 = bli A1, A2 = ANAME(a1), ANAME(a2) return BLMapEntry(a1, a2, A1, A2, A1+A2, A2+A1) blmap_ = map_(xform, blmap.baselineIndices()) def reductor(acc, bli): a1, a2 = bli A1, A2 = ANAME(a1).lower(), ANAME(a2).lower() acc.extend( [(A1, a1), (A2, a2)] ) return acc antmap_ = dict(reduce(reductor, blmap.baselineIndices(), list())) # a subparser for the selector selector_p = selector_parser() tok = lambda s: s.token tok_tp = lambda s: s.token.type tok_val = lambda s: s.token.value ###### Our grammar # selection = 'bl' [<selector>]* <eof> def parse_baseline(s): if tok(s).type != 'bl': raise SyntaxError("The baseline expression does not start with the keyword 'bl' but with {0}".format(tok(s))) # skip that one next(s) # now we may see one or more selections selections = parse_selection(s) # check mismatched parentheses in the expression(s) if s.depth!=0: raise SyntaxError("Mismatched parentheses") # the only token left should be 'eof' AND, after consuming it, # the stream should be empty. Anything else is a syntax error try: if tok(s).type is None: next(s) except StopIteration: # the selections are a list of functions which # update the current selection # We add one specific function to the end to evaluate the final result # and transform "bl none"'s taql of FALSE to '' (no taql, effectively) selections.append( fixup_bl_none ) return reduce(lambda acc, f: f(acc, blmap_), selections, ParseResult()) raise SyntaxError("(at least one)token left after parsing: {0}".format(tok(s))) # <selection> = <selector> { <selector> } def parse_selection(s): selectors = list() # we expect type 'selector' now! while tok(s).type == 'selector': selectors.append( selector_p( tok(s).value, blmap_, antmap_ ) ) next(s) return selectors class state_type: def __init__(self, tokstream): self.tokenstream = tokstream self.depth = 0 self.lookAhead = [] next(self) def peek(self): self.lookAhead.append( next(self.tokenstream) ) return self.lookAhead[-1] def __next__(self): if self.lookAhead: self.token = self.lookAhead.pop(0) else: self.token = next(self.tokenstream) return self next = __next__ tokenizer = mk_tokenizer(tokens, **kwargs) return parse_baseline(state_type(tokenizer(qry))) # expr = term '+' term | term '-' term | term '*' term | term '/' term | '(' expr ')' # term = duration | number | property | external # duration = \d+ 'd'[\d+ 'h'][\d+ 'm'] [\d+ ['.' \d* ] 's'] | # \d+ 'h'[\d+ 'm'] [\d+ ['.' \d* ] 's'] | # \d+ 'm' [\d+ ['.' \d* ] 's'] | # \d+ ['.' 
\d* ] 's'
# number     = int|float
# property   = alpha char {alpha char | digit | '_'}   # will get property from scan object
# external   = '$' property                            # will look up value of property in global namespace
# regex      = '/' {anychar - '/'} '/' ['i']           ('i' is the case-insensitive match flag)
# identifier = alpha {character}
# anychar    = character | symbol
# character  = alpha | digit
# alpha      = [a-zA-Z_] ;
# digit      = [0-9] ;

# selector    = attribs {'=' colorkey}
# attribs     = attrib {',' attrib}
# attrib      = attrname {'[' attrvallist ']'}
# attrname    = 'P' | 'CH' | 'SB' | 'FQ' | 'BL' | 'TIME' | 'SRC' |
#               'p' | 'ch' | 'sb' | 'fq' | 'bl' | 'time' | 'src'
# attrvallist = attrval {',' attrvallist }
# attrval     = number | string | regex
# number      = [0-9]+
# string      = 'text'
# colorkey    = number
# regex       = '/' text '/'
# text        = all characters except the termination (http://stackoverflow.com/a/5455705/26083)
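# Hedged end-to-end sketch of the baseline selection machinery above, assuming
# an array with antennas named ef, mc and ys, and a 'blmap' argument providing
# the antennaName()/baselineIndices() methods used by parse_baseline_expr:
#
#     pr = parse_baseline_expr("bl cross -ef* +ef(mc|ys)", blmap)
#
# starts from all cross baselines (ANTENNA1!=ANTENNA2), removes every baseline
# with ef at either end, then adds the ef-mc and ef-ys baselines back in.
# The resulting ParseResult carries both views of the selection:
#
#     pr.baselines   # the set of selected baseline tuples
#     pr.taql        # the accompanying TaQL expression, '' if nothing to constrain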