# -*- coding: utf-8 -*-
"""The Lucene Query DSL parser based on PLY
"""
import logging

# TODO : add reserved chars and escaping, regex
# see : https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html
# https://lucene.apache.org/core/3_6_0/queryparsersyntax.html
import re

import ply.lex as lex
import ply.yacc as yacc
from izihawa_utils.exceptions import BaseError

from .tree import (
    QUOTES,
    AndOperation,
    Boost,
    Fuzzy,
    Group,
    Not,
    OrOperation,
    Phrase,
    Plus,
    Prohibit,
    Proximity,
    Range,
    Regex,
    SearchField,
    UnknownOperation,
    Word,
    create_operation,
    group_to_fieldgroup,
)


class ParseError(BaseError):
    code = 'parse_error'
    level = logging.WARNING


reserved = {
    'AND': 'AND_OP',
    'OR': 'OR_OP',
    'NOT': 'NOT',
    'TO': 'TO',
    'to': 'TO',
}


# tokens of our grammar
tokens = (
    [
        'TERM', 'PHRASE', 'REGEX',
        'APPROX', 'BOOST',
        'MINUS', 'PLUS',
        'COLUMN',
        'LPAREN', 'RPAREN',
        'LBRACKET', 'RBRACKET',
    ]
    + sorted(set(reserved.values()))
)

# text of some simple tokens
t_PLUS = r'\+(?=\S)'
t_MINUS = r'\-(?=\S)'
t_NOT = 'NOT'
t_AND_OP = r'AND'
t_OR_OP = r'OR'
t_COLUMN = r'(?<=\S):(?=\S)'
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_LBRACKET = r'(\[|\{)'
t_RBRACKET = r'(\]|\})'

# precedence rules
precedence = (
    ('left', 'OR_OP'),
    ('left', 'AND_OP'),
    ('nonassoc', 'MINUS'),
    ('nonassoc', 'PLUS'),
    ('nonassoc', 'APPROX'),
    ('nonassoc', 'BOOST'),
    ('nonassoc', 'LPAREN', 'RPAREN'),
    ('nonassoc', 'LBRACKET', 'TO', 'RBRACKET'),
    ('nonassoc', 'REGEX'),
    ('nonassoc', 'PHRASE'),
    ('nonassoc', 'TERM'),
)

# term

# The ':' used in dates is problematic, because ':' is also our field
# delimiter: let's catch those expressions separately.
# Note: we must use a positive look-behind, because the regexp engine is eager,
# and it is only once it has arrived at ':' that it will try this rule.
TIME_RE = r'''
(?<=T\d{2}):    # look behind for T and two digits: hours
\d{2}           # minutes
(:\d{2})?       # seconds
'''
# This is a wide-catching expression, to also include date math.
# Inspired by the original lucene parser:
# https://github.com/apache/lucene-solr/blob/master/lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParser.jj#L189
# We do allow the wildcard operators ('*' and '?'), as our parser doesn't deal with them.
TERM_RE = fr'''
(?P<term>  # group term
  (?:
    [^\s:^~(){{}}[\]/,{QUOTES}+\-\\]  # first char is not a space nor a char which has a meaning
                                      # note: escape of "-" and "]"
                                      # and doubling of "{{}}" (because we use an f-string)
    |                                 # but
    \\.                               # we can start with an escaped character
  )
  ([^\s:^\\~(){{}}[\]{QUOTES}]  # following chars
   |                            # OR
   \\.                          # an escaped char
   |                            # OR
   {TIME_RE}                    # a time expression
  )*
)
'''

# phrase
PHRASE_RE = fr'''
(?P<phrase>  # phrase
  [{QUOTES}]       # opening quote
  (?:              # repeating
    [^\\{QUOTES}]  # - a char which is not an escape or the end of the phrase
    |              # OR
    \\.            # - an escaped char
  )*
  [{QUOTES}]       # closing quote
)'''
# r'(?P<phrase>"(?:[^\\"]|\\"|\\[^"])*")'  # this is quite complicated to handle \"

# modifiers after term or phrase
APPROX_RE = r'~(?P<degree>[0-9.]+)?'
BOOST_RE = r'\^(?P<force>[0-9.]+)?'

# regex
REGEX_RE = r'''
(?P<regex>  # regex
  /         # opening slash
  (?:       # repeating
    [^\\/]  # - a char which is not an escape or the end of the regex
    |       # OR
    \\.     # - an escaped char
  )*
  /         # closing slash
)'''
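

# Illustrative sketch (an addition, not part of the grammar itself): how the
# verbose regexes above are meant to be applied, mirroring their use in the
# lexer rules below.  The sample inputs are assumptions for demonstration; in
# particular they assume QUOTES includes the plain ASCII double quote.
def _demo_token_regexes():  # pragma: no cover
    """Show what the named groups capture; never called at import time."""
    # a bare term, wildcards included, since the parser passes them through
    assert re.match(TERM_RE, 'fuzzy*', re.VERBOSE).group('term') == 'fuzzy*'
    # a quoted phrase, spaces allowed inside
    assert re.match(PHRASE_RE, '"hello world"', re.VERBOSE).group('phrase') == '"hello world"'
    # a slash-delimited regex
    assert re.match(REGEX_RE, '/jo[hn].n/', re.VERBOSE).group('regex') == '/jo[hn].n/'
    # the modifier patterns capture only the numeric part
    assert re.match(APPROX_RE, '~2').group('degree') == '2'
    assert re.match(BOOST_RE, '^0.5').group('force') == '0.5'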


def t_IGNORE_HANGING_SIGNS(t):
    r"""\s+[\+-]\s+"""
    pass


def t_IGNORE_MAD_COLUMNS(t):
    r"""\s+:|:\s+|\s+:\s+"""
    pass


def t_SEPARATOR(t):
    r'\s+'
    pass  # discard separators


@lex.TOKEN(TERM_RE)
def t_TERM(t):
    # check if it is not a reserved term (an operation)
    t.type = reserved.get(t.value, 'TERM')
    # it's not, make it a Word
    if t.type == 'TERM':
        m = re.match(TERM_RE, t.value, re.VERBOSE)
        value = m.group("term")
        t.value = Word(value)
    return t


@lex.TOKEN(PHRASE_RE)
def t_PHRASE(t):
    m = re.match(PHRASE_RE, t.value, re.VERBOSE)
    value = m.group("phrase")
    t.value = Phrase(value)
    return t


@lex.TOKEN(REGEX_RE)
def t_REGEX(t):
    m = re.match(REGEX_RE, t.value, re.VERBOSE)
    value = m.group("regex")
    t.value = Regex(value)
    return t


@lex.TOKEN(APPROX_RE)
def t_APPROX(t):
    m = re.match(APPROX_RE, t.value)
    t.value = m.group("degree")
    return t


@lex.TOKEN(BOOST_RE)
def t_BOOST(t):
    m = re.match(BOOST_RE, t.value)
    t.value = m.group("force")
    return t


# Error handling rule FIXME
def t_error(t):  # pragma: no cover
    t.lexer.skip(1)


lexer = lex.lex()


def p_expression_or(p):
    '''expression : expression OR_OP expression'''
    p[0] = create_operation(OrOperation, p[1], p[3])


def p_expression_and(p):
    '''expression : expression AND_OP expression'''
    p[0] = create_operation(AndOperation, p[1], p[3])


def p_expression_implicit(p):
    '''expression : expression expression'''
    p[0] = create_operation(UnknownOperation, p[1], p[2])


def p_expression_plus(p):
    '''unary_expression : PLUS unary_expression'''
    p[0] = Plus(p[2])


def p_expression_minus(p):
    '''unary_expression : MINUS unary_expression'''
    p[0] = Prohibit(p[2])


def p_expression_not(p):
    '''unary_expression : NOT unary_expression'''
    p[0] = Not(p[2])


def p_expression_unary(p):
    '''expression : unary_expression'''
    p[0] = p[1]


def p_grouping(p):
    '''unary_expression : LPAREN expression RPAREN'''
    p[0] = Group(p[2])  # p_field_search will transform it into a FieldGroup if necessary


def p_range(p):
    '''unary_expression : LBRACKET phrase_or_term TO phrase_or_term RBRACKET'''
    include_low = p[1] == "["
    include_high = p[5] == "]"
    p[0] = Range(p[2], p[4], include_low, include_high)


def p_field_search(p):
    '''unary_expression : TERM COLUMN unary_expression'''
    if isinstance(p[3], Group):
        p[3] = group_to_fieldgroup(p[3])
    # for the field name we take p[1].value, since it was captured as a Word expression
    p[0] = SearchField(p[1].value, p[3])


def p_quoting(p):
    '''unary_expression : PHRASE'''
    p[0] = p[1]


def p_proximity(p):
    '''unary_expression : PHRASE APPROX'''
    p[0] = Proximity(p[1], p[2])


def p_boosting(p):
    '''expression : expression BOOST'''
    p[0] = Boost(p[1], p[2])


def p_terms(p):
    '''unary_expression : TERM'''
    p[0] = p[1]


def p_fuzzy(p):
    '''unary_expression : TERM APPROX'''
    p[0] = Fuzzy(p[1], p[2])


def p_regex(p):
    '''unary_expression : REGEX'''
    p[0] = p[1]


# handling a special case: TO is reserved only inside a range
def p_to_as_term(p):
    '''unary_expression : TO'''
    p[0] = Word(p[1])


def p_phrase_or_term(p):
    '''phrase_or_term : TERM
                      | PHRASE'''
    p[0] = p[1]


# Error rule for syntax errors
# TODO : should report better
def p_error(p):
    if p is None:
        p = "(probably at end of input, maybe an unmatched parenthesis or so)"
    raise ParseError(error="Syntax error in input at %r!" % p)


parser = yacc.yacc()
"""This is the parser generated by PLY
"""
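

# Hedged usage sketch (an addition for illustration, not part of the original
# module): parsing a query string yields a tree of nodes from .tree.  The query
# below is an assumed example; the exact node reprs depend on the tree classes.
if __name__ == '__main__':  # pragma: no cover
    tree = parser.parse('title:"foo bar"~2 AND (baz OR qux^3)')
    # roughly, assuming the tree classes keep their constructor arguments:
    # AndOperation(SearchField('title', Proximity(Phrase('"foo bar"'), '2')),
    #              Group(OrOperation(Word('baz'), Boost(Word('qux'), '3'))))
    print(repr(tree))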