hyperboria/nexus/meta_api/query_extensionner/grammar/parser.py

# -*- coding: utf-8 -*-
"""The Lucene Query DSL parser based on PLY
"""

import logging
# TODO : add reserved chars and escaping, regex
# see : https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html
# https://lucene.apache.org/core/3_6_0/queryparsersyntax.html
import re

import ply.lex as lex
import ply.yacc as yacc
from izihawa_utils.exceptions import BaseError

from .tree import (
    QUOTES,
    AndOperation,
    Boost,
    Fuzzy,
    Group,
    Not,
    OrOperation,
    Phrase,
    Plus,
    Prohibit,
    Proximity,
    Range,
    Regex,
    SearchField,
    UnknownOperation,
    Word,
    create_operation,
    group_to_fieldgroup,
)


class ParseError(BaseError):
    # Error raised by p_error when the parser rejects the input query.
    # NOTE(review): `code` and `level` are presumably consumed by BaseError
    # (izihawa_utils) for reporting/logging — confirm against that library.
    code = 'parse_error'
    level = logging.WARNING


# Keywords that t_TERM re-types instead of wrapping in a Word: the lexed text
# (key) is mapped to the token type (value) handed to the parser.
reserved = {
    # boolean operators
    'AND': 'AND_OP', 'OR': 'OR_OP', 'NOT': 'NOT',
    # range separator, accepted in upper and lower case
    'TO': 'TO', 'to': 'TO',
}


# Token types of our grammar: the ones produced by the lexer rules first,
# followed by the keyword token types taken from `reserved` (deduplicated
# and sorted so the list is deterministic).
tokens = [
    'TERM', 'PHRASE', 'REGEX',
    'APPROX', 'BOOST',
    'MINUS', 'PLUS', 'COLUMN',
    'LPAREN', 'RPAREN',
    'LBRACKET', 'RBRACKET',
    *sorted(set(reserved.values())),
]


# Regular expressions of the simple (string-defined) tokens.
# '+' and '-' are only operators when directly followed by a non-space
# character (lookahead); signs surrounded by spaces are handled by
# t_IGNORE_HANGING_SIGNS.
t_PLUS = r'\+(?=\S)'
t_MINUS = r'\-(?=\S)'
# NOTE(review): the three keyword patterns below have no word boundary, and
# t_TERM also maps the same words through `reserved`; which rule actually
# fires depends on PLY's rule ordering — confirm before relying on either.
t_NOT = 'NOT'
t_AND_OP = r'AND'
t_OR_OP = r'OR'
# ':' is a field separator only with a non-space character on both sides;
# other colons are swallowed by t_IGNORE_MAD_COLUMNS.
t_COLUMN = r'(?<=\S):(?=\S)'
t_LPAREN = r'\('
t_RPAREN = r'\)'
# Both '[' / '{' (and ']' / '}') delimit a range; p_range compares the matched
# text with "[" and "]" to decide whether each bound is inclusive.
t_LBRACKET = r'(\[|\{)'
t_RBRACKET = r'(\]|\})'

# Precedence / associativity table consumed by the PLY parser generator.
# NOTE(review): by PLY's documented convention the entries go from lowest to
# highest precedence — confirm before reordering anything here.
precedence = (
    ('left', 'OR_OP'),
    ('left', 'AND_OP'),
    ('nonassoc', 'MINUS'),
    ('nonassoc', 'PLUS'),
    ('nonassoc', 'APPROX'),
    ('nonassoc', 'BOOST'),
    ('nonassoc', 'LPAREN', 'RPAREN'),
    ('nonassoc', 'LBRACKET', 'TO', 'RBRACKET'),
    ('nonassoc', 'REGEX'),
    ('nonassoc', 'PHRASE'),
    ('nonassoc', 'TERM'),
)

# term

# The ':' used inside times (e.g. "T12:30:00") is problematic because ':' is
# also the field delimiter, so those expressions are caught separately.
# Note: a positive look-behind is required here, because the regexp engine
# only gets to try this alternative once it has reached the ':' itself; by
# then "T" and the two hour digits have already been consumed.
# The pattern is written for verbose mode and is embedded in TERM_RE.
TIME_RE = r'''
(?<=T\d{2}):  # look behind for T and two digits: hours
\d{2}         # minutes
(:\d{2})?     # seconds
'''
# This is a deliberately wide-catching expression, so that it also covers
# date math.
# Inspired by the original lucene parser:
# https://github.com/apache/lucene-solr/blob/master/lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParser.jj#L189
# The wildcard operators ('*' and '?') are allowed inside a term, as our
# parser does not handle them itself.
#
# Verbose-mode pattern (t_TERM re-matches it with re.VERBOSE). It is an
# f-string: {QUOTES} (from .tree) and {TIME_RE} are interpolated, which is why
# literal braces are doubled. The whole match is captured in group "term".

TERM_RE = fr'''
(?P<term>  # group term
  (?:
   [^\s:^~(){{}}[\]/,{QUOTES}+\-\\] # first char is not a space neither some char which have meanings
                              # note: escape of "-" and "]"
                              #       and doubling of "{{}}" (because we use format)
   |                          # but
   \\.                        # we can start with an escaped character
  )
  ([^\s:^\\~(){{}}[\]{QUOTES}]        # following chars
   |                          # OR
   \\.                        # an escaped char
   |                          # OR
   {TIME_RE}                  # a time expression
  )*
)
'''
# Phrase: a quoted string. Any character listed in QUOTES (from .tree) may
# open or close it; the pattern does not require the closing quote to be the
# same character as the opening one. Backslash-escaped characters are allowed
# inside. Verbose-mode f-string; the match (quotes included) is in group
# "phrase".
PHRASE_RE = fr'''
(?P<phrase>  # phrase
  [{QUOTES}]  # opening quote
  (?:        # repeating
    [^\\{QUOTES}]   # - a char which is not escape or end of phrase
    |        # OR
    \\.      # - an escaped char
  )*
  [{QUOTES}]  # closing quote
)'''
# Earlier single-line variant, kept for reference (handling \" that way was
# judged too complicated):
# r'(?P<phrase>"(?:[^\\"]|\\"|\\[^"])*")'
# Modifiers that may follow a term or a phrase.
# '~' optionally followed by digits/dots: the number is captured in group
# "degree" (the group is optional, so it is None for a bare '~').
APPROX_RE = r'~(?P<degree>[0-9.]+)?'
# '^' optionally followed by digits/dots: the number is captured in group
# "force" (None for a bare '^').
BOOST_RE = r'\^(?P<force>[0-9.]+)?'

# Regular-expression literal: text between two slashes, in which a slash (or
# any other character) can be included by escaping it with a backslash.
# Verbose-mode pattern; the match (slashes included) is in group "regex".
REGEX_RE = r'''
(?P<regex>  # regex
  /         # open slash
  (?:       # repeating
    [^\\/]  # - a char which is not escape or end of regex
    |       # OR
    \\.     # an escaped char
  )*
  /         # closing slash
)'''


def t_IGNORE_HANGING_SIGNS(t):
    r"""\s+[\+-]\s+"""
    # The docstring above is the token regex: a '+' or '-' with whitespace on
    # both sides. Nothing is returned, so the sign is dropped (same convention
    # as t_SEPARATOR).
    return None


def t_IGNORE_MAD_COLUMNS(t):
    r"""\s+:|:\s+|\s+:\s+"""
    # The docstring above is the token regex: a ':' that has whitespace before
    # and/or after it, i.e. one that t_COLUMN refuses. Nothing is returned, so
    # the stray colon is dropped.
    return None


def t_SEPARATOR(t):
    r'\s+'
    # The docstring above is the token regex (a run of whitespace).
    # Separators carry no meaning: return nothing so they are discarded.
    return None


@lex.TOKEN(TERM_RE)
def t_TERM(t):
    # Lexer rule for bare terms. Reserved words (see `reserved`) keep their
    # raw text as value and only get their token type changed; anything else
    # becomes a TERM token whose value is a Word.
    token_type = reserved.get(t.value, 'TERM')
    t.type = token_type
    if token_type != 'TERM':
        # an operation keyword: hand it to the parser untouched
        return t
    # a plain term: wrap the text captured by the "term" group
    matched = re.match(TERM_RE, t.value, re.VERBOSE)
    t.value = Word(matched.group("term"))
    return t


@lex.TOKEN(PHRASE_RE)
def t_PHRASE(t):
    # Lexer rule for quoted phrases: the token value becomes a Phrase built
    # from the text captured by the "phrase" group (quotes included).
    matched = re.match(PHRASE_RE, t.value, re.VERBOSE)
    t.value = Phrase(matched.group("phrase"))
    return t


@lex.TOKEN(REGEX_RE)
def t_REGEX(t):
    # Lexer rule for /regex/ literals: the token value becomes a Regex built
    # from the text captured by the "regex" group (slashes included).
    matched = re.match(REGEX_RE, t.value, re.VERBOSE)
    t.value = Regex(matched.group("regex"))
    return t


@lex.TOKEN(APPROX_RE)
def t_APPROX(t):
    # Lexer rule for the '~' modifier: the token value is reduced to the text
    # of the optional "degree" group, i.e. None when '~' has no number.
    t.value = re.match(APPROX_RE, t.value).group("degree")
    return t


@lex.TOKEN(BOOST_RE)
def t_BOOST(t):
    # Lexer rule for the '^' modifier: the token value is reduced to the text
    # of the optional "force" group, i.e. None when '^' has no number.
    t.value = re.match(BOOST_RE, t.value).group("force")
    return t


# Lexer error rule.
# FIXME: input that matches no token rule is skipped one character at a time
# without being reported.
def t_error(t):  # pragma: no cover
    failing_lexer = t.lexer
    failing_lexer.skip(1)


# Module-level lexer instance, built once at import time.
# NOTE(review): PLY is expected to collect the `tokens` list and the t_* rules
# defined above from this module — confirm against the PLY documentation.
lexer = lex.lex()


def p_expression_or(p):
    'expression : expression OR_OP expression'
    # The docstring is the grammar rule. p[1] and p[3] are the operands on
    # each side of OR; p[2] is the operator token itself.
    left, right = p[1], p[3]
    p[0] = create_operation(OrOperation, left, right)


def p_expression_and(p):
    '''expression : expression AND_OP expression'''
    # The docstring is the grammar rule. The rule has exactly three symbols,
    # so the right operand (the last symbol) is p[3].
    left, right = p[1], p[3]
    p[0] = create_operation(AndOperation, left, right)


def p_expression_implicit(p):
    '''expression : expression expression'''
    # The docstring is the grammar rule. Two expressions side by side with no
    # explicit operator are combined as an UnknownOperation.
    first, second = p[1], p[2]
    p[0] = create_operation(UnknownOperation, first, second)


def p_expression_plus(p):
    '''unary_expression : PLUS unary_expression'''
    # The docstring is the grammar rule: '+' prefix, wraps the operand in Plus.
    operand = p[2]
    p[0] = Plus(operand)


def p_expression_minus(p):
    '''unary_expression : MINUS unary_expression'''
    # The docstring is the grammar rule: '-' prefix, wraps the operand in
    # Prohibit.
    operand = p[2]
    p[0] = Prohibit(operand)


def p_expression_not(p):
    '''unary_expression : NOT unary_expression'''
    # The docstring is the grammar rule: NOT keyword, wraps the operand in Not.
    operand = p[2]
    p[0] = Not(operand)


def p_expression_unary(p):
    '''expression : unary_expression'''
    # The docstring is the grammar rule: a unary expression is an expression;
    # its value is passed through unchanged.
    unary = p[1]
    p[0] = unary


def p_grouping(p):
    'unary_expression : LPAREN expression RPAREN'
    # The docstring is the grammar rule. The parenthesised expression is
    # wrapped in a Group; p_field_search converts it with group_to_fieldgroup
    # when the group turns out to be the value of a field.
    inner = p[2]
    p[0] = Group(inner)


def p_range(p):
    '''unary_expression : LBRACKET phrase_or_term TO phrase_or_term RBRACKET'''
    # The docstring is the grammar rule. A bound is inclusive when its bracket
    # is square ("[" / "]"); the other bracket the lexer accepts ("{" / "}")
    # makes it exclusive.
    opening, low, high, closing = p[1], p[2], p[4], p[5]
    p[0] = Range(low, high, opening == "[", closing == "]")


def p_field_search(p):
    '''unary_expression : TERM COLUMN unary_expression'''
    # The docstring is the grammar rule: `field:expression`.
    # The field name was lexed as a Word, so its text is taken from `.value`.
    field_name = p[1].value
    if isinstance(p[3], Group):
        # a parenthesised value is converted by group_to_fieldgroup
        p[3] = group_to_fieldgroup(p[3])
    p[0] = SearchField(field_name, p[3])


def p_quoting(p):
    'unary_expression : PHRASE'
    # The docstring is the grammar rule: a lone phrase is passed through as is
    # (t_PHRASE already turned it into a Phrase).
    phrase = p[1]
    p[0] = phrase


def p_proximity(p):
    '''unary_expression : PHRASE APPROX'''
    # The docstring is the grammar rule: a phrase followed by '~'. p[2] is the
    # value set by t_APPROX (the degree text, or None).
    phrase, degree = p[1], p[2]
    p[0] = Proximity(phrase, degree)


def p_boosting(p):
    '''expression : expression BOOST'''
    # The docstring is the grammar rule: an expression followed by '^'. p[2]
    # is the value set by t_BOOST (the force text, or None).
    boosted, force = p[1], p[2]
    p[0] = Boost(boosted, force)


def p_terms(p):
    '''unary_expression : TERM'''
    # The docstring is the grammar rule: a lone term is passed through as is
    # (t_TERM already turned it into a Word).
    term = p[1]
    p[0] = term


def p_fuzzy(p):
    '''unary_expression : TERM APPROX'''
    # The docstring is the grammar rule: a term followed by '~'. p[2] is the
    # value set by t_APPROX (the degree text, or None).
    term, degree = p[1], p[2]
    p[0] = Fuzzy(term, degree)


def p_regex(p):
    '''unary_expression : REGEX'''
    # The docstring is the grammar rule: a regex literal is passed through as
    # is (t_REGEX already turned it into a Regex).
    regex = p[1]
    p[0] = regex


# Special case: TO is only a keyword inside a range. Anywhere else it is an
# ordinary word, so it is wrapped in a Word here (t_TERM left its raw text as
# the token value).
def p_to_as_term(p):
    '''unary_expression : TO'''
    raw_text = p[1]
    p[0] = Word(raw_text)


def p_phrase_or_term(p):
    '''phrase_or_term : TERM
                      | PHRASE'''
    # The docstring is the grammar rule. This is the non-terminal used for the
    # bounds in p_range: either token is passed through unchanged.
    bound = p[1]
    p[0] = bound


# Error rule for syntax errors: always raises ParseError.
# TODO: report the error in a more helpful way.
def p_error(p):
    # `p` is the offending token; when it is None a fixed hint is shown
    # instead.
    culprit = (
        p if p is not None
        else "(probably at end of input, may be unmatch parenthesis or so)"
    )
    raise ParseError(error="Syntax error in input at %r!" % culprit)


# Module-level parser instance, built once at import time.
# NOTE(review): PLY is expected to build it from the p_* rules, `tokens` and
# `precedence` defined in this module, and with default arguments it may write
# table/debug files next to the module — confirm against the PLY docs.
parser = yacc.yacc()
"""This is the parser generated by PLY
"""
- feat(nexus): Bump versions - fix(nexus): Preparing configs to be published - feat(nexus): Various fixes for opening left sources - fix(nexus): Fine-tune versions 1 internal commit(s) GitOrigin-RevId: 6c834cd3f4f5f18109a159a73503700dac63b0bb 2021-04-23 17:23:02 +02:00			`# -- coding: utf-8 --`
			`"""The Lucene Query DSL parser based on PLY`
			`"""`

			`import logging`
			`# TODO : add reserved chars and escaping, regex`
			`# see : https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html`
			`# https://lucene.apache.org/core/3_6_0/queryparsersyntax.html`
			`import re`

			`import ply.lex as lex`
			`import ply.yacc as yacc`
			`from izihawa_utils.exceptions import BaseError`

			`from .tree import (`
			`QUOTES,`
			`AndOperation,`
			`Boost,`
			`Fuzzy,`
			`Group,`
			`Not,`
			`OrOperation,`
			`Phrase,`
			`Plus,`
			`Prohibit,`
			`Proximity,`
			`Range,`
			`Regex,`
			`SearchField,`
			`UnknownOperation,`
			`Word,`
			`create_operation,`
			`group_to_fieldgroup,`
			`)`


			`class ParseError(BaseError):`
			`code = 'parse_error'`
			`level = logging.WARNING`


			`reserved = {`
			`'AND': 'AND_OP',`
			`'OR': 'OR_OP',`
			`'NOT': 'NOT',`
			`'TO': 'TO',`
			`'to': 'TO',`
			`}`


			`# tokens of our grammar`
			`tokens = (`
			`['TERM',`
			`'PHRASE',`
			`'REGEX',`
			`'APPROX',`
			`'BOOST',`
			`'MINUS',`
			`'PLUS',`
			`'COLUMN',`
			`'LPAREN',`
			`'RPAREN',`
			`'LBRACKET',`
			`'RBRACKET'`
			`] + sorted(set(reserved.values()))`
			`)`


			`# text of some simple tokens`
			`t_PLUS = r'\+(?=\S)'`
			`t_MINUS = r'\-(?=\S)'`
			`t_NOT = 'NOT'`
			`t_AND_OP = r'AND'`
			`t_OR_OP = r'OR'`
			`t_COLUMN = r'(?<=\S):(?=\S)'`
			`t_LPAREN = r'\('`
			`t_RPAREN = r'\)'`
			`t_LBRACKET = r'(\[\|\{)'`
			`t_RBRACKET = r'(\]\|\})'`

			`# precedence rules`
			`precedence = (`
			`('left', 'OR_OP',),`
			`('left', 'AND_OP'),`
			`('nonassoc', 'MINUS',),`
			`('nonassoc', 'PLUS',),`
			`('nonassoc', 'APPROX'),`
			`('nonassoc', 'BOOST'),`
			`('nonassoc', 'LPAREN', 'RPAREN'),`
			`('nonassoc', 'LBRACKET', 'TO', 'RBRACKET'),`
			`('nonassoc', 'REGEX'),`
			`('nonassoc', 'PHRASE'),`
			`('nonassoc', 'TERM'),`
			`)`

			`# term`

			`# the case of : which is used in date is problematic because it is also a delimiter`
			`# lets catch those expressions appart`
			`# Note : we must use positive look behind, because regexp engine is eager,`
			`# and it's only arrived at ':' that it will try this rule`
			`TIME_RE = r'''`
			`(?<=T\d{2}): # look behind for T and two digits: hours`
			`\d{2} # minutes`
			`(:\d{2})? # seconds`
			`'''`
			`# this is a wide catching expression, to also include date math.`
			`# Inspired by the original lucene parser:`
			`# https://github.com/apache/lucene-solr/blob/master/lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParser.jj#L189`
			`# We do allow the wildcards operators ('*' and '?') as our parser doesn't deal with them.`

			`TERM_RE = fr'''`
			`(?P<term> # group term`
			`(?:`
			`[^\s:^~(){{}}[\]/,{QUOTES}+\-\\] # first char is not a space neither some char which have meanings`
			`# note: escape of "-" and "]"`
			`# and doubling of "{{}}" (because we use format)`
			`\| # but`
			`\\. # we can start with an escaped character`
			`)`
			`([^\s:^\\~(){{}}[\]{QUOTES}] # following chars`
			`\| # OR`
			`\\. # an escaped char`
			`\| # OR`
			`{TIME_RE} # a time expression`
			`)*`
			`)`
			`'''`
			`# phrase`
			`PHRASE_RE = fr'''`
			`(?P<phrase> # phrase`
			`[{QUOTES}] # opening quote`
			`(?: # repeating`
			`[^\\{QUOTES}] # - a char which is not escape or end of phrase`
			`\| # OR`
			`\\. # - an escaped char`
			`)*`
			`[{QUOTES}] # closing quote`
			`)'''`
			`# r'(?P<phrase>"(?:[^\\"]\|\\"\|\\[^"])*")' # this is quite complicated to handle \"`
			`# modifiers after term or phrase`
			`APPROX_RE = r'~(?P<degree>[0-9.]+)?'`
			`BOOST_RE = r'\^(?P<force>[0-9.]+)?'`

			`# regex`
			`REGEX_RE = r'''`
			`(?P<regex> # regex`
			`/ # open slash`
			`(?: # repeating`
			`[^\\/] # - a char which is not escape or end of regex`
			`\| # OR`
			`\\. # an escaped char`
			`)*`
			`/ # closing slash`
			`)'''`


			`def t_IGNORE_HANGING_SIGNS(t):`
			`r"""\s+[\+-]\s+"""`
			`pass`


			`def t_IGNORE_MAD_COLUMNS(t):`
			`r"""\s+:\|:\s+\|\s+:\s+"""`
			`pass`


			`def t_SEPARATOR(t):`
			`r'\s+'`
			`pass # discard separators`


			`@lex.TOKEN(TERM_RE)`
			`def t_TERM(t):`
			`# check if it is not a reserved term (an operation)`
			`t.type = reserved.get(t.value, 'TERM')`
			`# it's not, make it a Word`
			`if t.type == 'TERM':`
			`m = re.match(TERM_RE, t.value, re.VERBOSE)`
			`value = m.group("term")`
			`t.value = Word(value)`
			`return t`


			`@lex.TOKEN(PHRASE_RE)`
			`def t_PHRASE(t):`
			`m = re.match(PHRASE_RE, t.value, re.VERBOSE)`
			`value = m.group("phrase")`
			`t.value = Phrase(value)`
			`return t`


			`@lex.TOKEN(REGEX_RE)`
			`def t_REGEX(t):`
			`m = re.match(REGEX_RE, t.value, re.VERBOSE)`
			`value = m.group("regex")`
			`t.value = Regex(value)`
			`return t`


			`@lex.TOKEN(APPROX_RE)`
			`def t_APPROX(t):`
			`m = re.match(APPROX_RE, t.value)`
			`t.value = m.group("degree")`
			`return t`


			`@lex.TOKEN(BOOST_RE)`
			`def t_BOOST(t):`
			`m = re.match(BOOST_RE, t.value)`
			`t.value = m.group("force")`
			`return t`


			`# Error handling rule FIXME`
			`def t_error(t): # pragma: no cover`
			`t.lexer.skip(1)`


			`lexer = lex.lex()`


			`def p_expression_or(p):`
			`'expression : expression OR_OP expression'`
			`p[0] = create_operation(OrOperation, p[1], p[3])`


			`def p_expression_and(p):`
			`'''expression : expression AND_OP expression'''`
			`p[0] = create_operation(AndOperation, p[1], p[len(p) - 1])`


			`def p_expression_implicit(p):`
			`'''expression : expression expression'''`
			`p[0] = create_operation(UnknownOperation, p[1], p[2])`


			`def p_expression_plus(p):`
			`'''unary_expression : PLUS unary_expression'''`
			`p[0] = Plus(p[2])`


			`def p_expression_minus(p):`
			`'''unary_expression : MINUS unary_expression'''`
			`p[0] = Prohibit(p[2])`


			`def p_expression_not(p):`
			`'''unary_expression : NOT unary_expression'''`
			`p[0] = Not(p[2])`


			`def p_expression_unary(p):`
			`'''expression : unary_expression'''`
			`p[0] = p[1]`


			`def p_grouping(p):`
			`'unary_expression : LPAREN expression RPAREN'`
			`p[0] = Group(p[2]) # Will p_field_search will transform as FieldGroup if necessary`


			`def p_range(p):`
			`'''unary_expression : LBRACKET phrase_or_term TO phrase_or_term RBRACKET'''`
			`include_low = p[1] == "["`
			`include_high = p[5] == "]"`
			`p[0] = Range(p[2], p[4], include_low, include_high)`


			`def p_field_search(p):`
			`'''unary_expression : TERM COLUMN unary_expression'''`
			`if isinstance(p[3], Group):`
			`p[3] = group_to_fieldgroup(p[3])`
			`# for field name we take p[1].value for it was captured as a word expression`
			`p[0] = SearchField(p[1].value, p[3])`


			`def p_quoting(p):`
			`'unary_expression : PHRASE'`
			`p[0] = p[1]`


			`def p_proximity(p):`
			`'''unary_expression : PHRASE APPROX'''`
			`p[0] = Proximity(p[1], p[2])`


			`def p_boosting(p):`
			`'''expression : expression BOOST'''`
			`p[0] = Boost(p[1], p[2])`


			`def p_terms(p):`
			`'''unary_expression : TERM'''`
			`p[0] = p[1]`


			`def p_fuzzy(p):`
			`'''unary_expression : TERM APPROX'''`
			`p[0] = Fuzzy(p[1], p[2])`


			`def p_regex(p):`
			`'''unary_expression : REGEX'''`
			`p[0] = p[1]`


			`# handling a special case, TO is reserved only in range`
			`def p_to_as_term(p):`
			`'''unary_expression : TO'''`
			`p[0] = Word(p[1])`


			`def p_phrase_or_term(p):`
			`'''phrase_or_term : TERM`
			`\| PHRASE'''`
			`p[0] = p[1]`


			`# Error rule for syntax errors`
			`# TODO : should report better`
			`def p_error(p):`
			`if p is None:`
			`p = "(probably at end of input, may be unmatch parenthesis or so)"`
			`raise ParseError(error="Syntax error in input at %r!" % p)`


			`parser = yacc.yacc()`
			`"""This is the parser generated by PLY`
			`"""`