mirror of
https://github.com/nexus-stc/hyperboria
synced 2025-01-07 09:15:58 +01:00
331 lines
7.3 KiB
Python
331 lines
7.3 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
"""The Lucene Query DSL parser based on PLY
|
||
|
"""
|
||
|
|
||
|
import logging
|
||
|
# TODO : add reserved chars and escaping, regex
|
||
|
# see : https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html
|
||
|
# https://lucene.apache.org/core/3_6_0/queryparsersyntax.html
|
||
|
import re
|
||
|
|
||
|
import ply.lex as lex
|
||
|
import ply.yacc as yacc
|
||
|
from izihawa_utils.exceptions import BaseError
|
||
|
|
||
|
from .tree import (
|
||
|
QUOTES,
|
||
|
AndOperation,
|
||
|
Boost,
|
||
|
Fuzzy,
|
||
|
Group,
|
||
|
Not,
|
||
|
OrOperation,
|
||
|
Phrase,
|
||
|
Plus,
|
||
|
Prohibit,
|
||
|
Proximity,
|
||
|
Range,
|
||
|
Regex,
|
||
|
SearchField,
|
||
|
UnknownOperation,
|
||
|
Word,
|
||
|
create_operation,
|
||
|
group_to_fieldgroup,
|
||
|
)
|
||
|
|
||
|
|
||
|
class ParseError(BaseError):
    """Error raised when a query string cannot be parsed.

    ``p_error`` raises it with an ``error`` keyword argument describing
    where the syntax error was detected.
    """
    # NOTE(review): `code` and `level` are presumably consumed by BaseError
    # (izihawa_utils) to identify and log the error -- confirm in that package.
    code = 'parse_error'
    level = logging.WARNING
|
||
|
|
||
|
|
||
|
# Reserved words of the query language, mapped to the token type they yield.
# t_TERM looks every matched term up in this table; note that the range
# keyword is accepted both as "TO" and as "to".
reserved = {
    'AND': 'AND_OP',
    'OR': 'OR_OP',
    'NOT': 'NOT',
    'TO': 'TO',
    'to': 'TO',
}


# tokens of our grammar: the lexical tokens first, then the (de-duplicated,
# alphabetically sorted) token types produced by the reserved words.
tokens = [
    'TERM',
    'PHRASE',
    'REGEX',
    'APPROX',
    'BOOST',
    'MINUS',
    'PLUS',
    'COLUMN',
    'LPAREN',
    'RPAREN',
    'LBRACKET',
    'RBRACKET',
    *sorted(set(reserved.values())),
]
|
||
|
|
||
|
|
||
|
# text of some simple tokens
# '+' and '-' are only operators when directly followed by a non-space
# character (look-ahead); hanging signs are dropped by t_IGNORE_HANGING_SIGNS.
t_PLUS = r'\+(?=\S)'
t_MINUS = r'\-(?=\S)'
# Keywords; t_TERM also maps these words to their token type through `reserved`.
t_NOT = 'NOT'
t_AND_OP = r'AND'
t_OR_OP = r'OR'
# ':' is only a field separator when it has a non-space character on both sides.
t_COLUMN = r'(?<=\S):(?=\S)'
t_LPAREN = r'\('
t_RPAREN = r'\)'
# Range delimiters: p_range treats '[' / ']' as inclusive bounds and the
# curly variants as exclusive ones.
t_LBRACKET = r'(\[|\{)'
t_RBRACKET = r'(\]|\})'
|
||
|
|
||
|
# precedence rules
# PLY reads this table from lowest precedence (first entry) to highest
# (last entry); each entry is (associativity, token, ...).
precedence = (
    ('left', 'OR_OP',),
    ('left', 'AND_OP'),
    ('nonassoc', 'MINUS',),
    ('nonassoc', 'PLUS',),
    ('nonassoc', 'APPROX'),
    ('nonassoc', 'BOOST'),
    ('nonassoc', 'LPAREN', 'RPAREN'),
    ('nonassoc', 'LBRACKET', 'TO', 'RBRACKET'),
    ('nonassoc', 'REGEX'),
    ('nonassoc', 'PHRASE'),
    ('nonassoc', 'TERM'),
)
|
||
|
|
||
|
# term
|
||
|
|
||
|
# The case of ':' as used in dates/times is problematic because ':' is also
# the field delimiter, so let's catch those expressions apart.
# Note: we must use a positive look-behind, because the regexp engine is eager
# and it is only once it has arrived at ':' that it will try this rule.
# The pattern relies on verbose mode (whitespace and '#' comments ignored).
TIME_RE = r'''
(?<=T\d{2}): # look behind for T and two digits: hours
\d{2} # minutes
(:\d{2})? # seconds
'''
|
||
|
# This is a wide catching expression, to also include date math.
# Inspired by the original lucene parser:
# https://github.com/apache/lucene-solr/blob/master/lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParser.jj#L189
# We do allow the wildcard operators ('*' and '?') as our parser doesn't deal with them.
#
# The literal is an f-string: QUOTES and TIME_RE are interpolated, which is
# why literal braces are doubled. It must be compiled in verbose mode
# (t_TERM passes re.VERBOSE when it re-matches the token text).

TERM_RE = fr'''
(?P<term> # group term
(?:
[^\s:^~(){{}}[\]/,{QUOTES}+\-\\] # first char is not a space neither some char which have meanings
# note: escape of "-" and "]"
# and doubling of "{{}}" (because we use format)
| # but
\\. # we can start with an escaped character
)
([^\s:^\\~(){{}}[\]{QUOTES}] # following chars
| # OR
\\. # an escaped char
| # OR
{TIME_RE} # a time expression
)*
)
'''
|
||
|
# phrase
# A quoted string: any run of non-quote, non-backslash characters or
# backslash-escaped characters between two quote characters taken from
# QUOTES. Verbose-mode pattern (t_PHRASE re-matches it with re.VERBOSE).
PHRASE_RE = fr'''
(?P<phrase> # phrase
[{QUOTES}] # opening quote
(?: # repeating
[^\\{QUOTES}] # - a char which is not escape or end of phrase
| # OR
\\. # - an escaped char
)*
[{QUOTES}] # closing quote
)'''
# Earlier single-line variant, kept for reference:
# r'(?P<phrase>"(?:[^\\"]|\\"|\\[^"])*")' # this is quite complicated to handle \"
|
||
|
# modifiers after term or phrase
# Both capture an optional number made of digits and dots; when the number is
# omitted the named group is None (t_APPROX / t_BOOST pass that value on).
# These two patterns are matched WITHOUT re.VERBOSE, so they must stay free of
# insignificant whitespace.
APPROX_RE = r'~(?P<degree>[0-9.]+)?'
BOOST_RE = r'\^(?P<force>[0-9.]+)?'
|
||
|
|
||
|
# regex
# A slash-delimited regular expression; a backslash escapes the next
# character, so an escaped slash does not end the regex. Verbose-mode pattern
# (t_REGEX re-matches it with re.VERBOSE).
REGEX_RE = r'''
(?P<regex> # regex
/ # open slash
(?: # repeating
[^\\/] # - a char which is not escape or end of regex
| # OR
\\. # an escaped char
)*
/ # closing slash
)'''
|
||
|
|
||
|
|
||
|
# Lexer rule (the docstring is the token regex, do not edit it as prose):
# a '+' or '-' with whitespace on both sides is not attached to any operand,
# so it is consumed together with the surrounding whitespace and dropped.
def t_IGNORE_HANGING_SIGNS(t):
    r"""\s+[\+-]\s+"""
    return None  # no token is produced
|
||
|
|
||
|
|
||
|
# Lexer rule (the docstring is the token regex): a ':' that touches whitespace
# on either side cannot be a field separator (see t_COLUMN), so it is dropped.
# NOTE(review): the third alternative can never be selected, because the first
# one (`\s+:`) already matches wherever `\s+:\s+` would; the trailing
# whitespace is then left to the following rules.
def t_IGNORE_MAD_COLUMNS(t):
    r"""\s+:|:\s+|\s+:\s+"""
    return None  # no token is produced
|
||
|
|
||
|
|
||
|
# Lexer rule (the docstring is the token regex): plain whitespace only
# separates tokens and never reaches the parser.
def t_SEPARATOR(t):
    r'\s+'
    return None  # discard separators
|
||
|
|
||
|
|
||
|
@lex.TOKEN(TERM_RE)
def t_TERM(t):
    # A term is either a reserved word (an operator such as AND / OR / NOT / TO)
    # or a plain word of the query.
    token_type = reserved.get(t.value, 'TERM')
    t.type = token_type
    if token_type != 'TERM':
        # reserved word: keep the raw text as the token value
        return t

    # ordinary term: wrap the matched text into a Word node
    matched = re.match(TERM_RE, t.value, re.VERBOSE)
    t.value = Word(matched.group("term"))
    return t
|
||
|
|
||
|
|
||
|
@lex.TOKEN(PHRASE_RE)
def t_PHRASE(t):
    # Turn the quoted text (quotes included, as captured by the "phrase"
    # group) into a Phrase node.
    matched = re.match(PHRASE_RE, t.value, re.VERBOSE)
    t.value = Phrase(matched.group("phrase"))
    return t
|
||
|
|
||
|
|
||
|
@lex.TOKEN(REGEX_RE)
def t_REGEX(t):
    # Turn the slash-delimited text (slashes included, as captured by the
    # "regex" group) into a Regex node.
    matched = re.match(REGEX_RE, t.value, re.VERBOSE)
    t.value = Regex(matched.group("regex"))
    return t
|
||
|
|
||
|
|
||
|
@lex.TOKEN(APPROX_RE)
def t_APPROX(t):
    # Keep only the number following '~' as the token value; it is None when
    # no number was given.
    degree = re.match(APPROX_RE, t.value).group("degree")
    t.value = degree
    return t
|
||
|
|
||
|
|
||
|
@lex.TOKEN(BOOST_RE)
def t_BOOST(t):
    # Keep only the number following '^' as the token value; it is None when
    # no number was given.
    force = re.match(BOOST_RE, t.value).group("force")
    t.value = force
    return t
|
||
|
|
||
|
|
||
|
# Error handling rule FIXME
# Called when no token rule matches at the current position: the offending
# character is silently skipped and lexing resumes right after it.
def t_error(t):  # pragma: no cover
    lexer = t.lexer
    lexer.skip(1)
|
||
|
|
||
|
|
||
|
# Module-level lexer, built by PLY from the `tokens` list and the t_* rules
# defined in this module.
lexer = lex.lex()
|
||
|
|
||
|
|
||
|
def p_expression_or(p):
    'expression : expression OR_OP expression'
    # The docstring above is the grammar rule. Combine both operands of an
    # explicit OR into an OrOperation.
    left, right = p[1], p[3]
    p[0] = create_operation(OrOperation, left, right)
|
||
|
|
||
|
|
||
|
def p_expression_and(p):
    '''expression : expression AND_OP expression'''
    # The docstring above is the grammar rule. Combine both operands of an
    # explicit AND into an AndOperation; the right operand is the last
    # symbol of the three-symbol rule, i.e. p[3].
    left, right = p[1], p[3]
    p[0] = create_operation(AndOperation, left, right)
|
||
|
|
||
|
|
||
|
def p_expression_implicit(p):
    '''expression : expression expression'''
    # Two adjacent expressions without an operator in between: the operation
    # is left undetermined and recorded as an UnknownOperation.
    first, second = p[1], p[2]
    p[0] = create_operation(UnknownOperation, first, second)
|
||
|
|
||
|
|
||
|
def p_expression_plus(p):
    '''unary_expression : PLUS unary_expression'''
    # '+' prefix: wrap the operand in a Plus node.
    operand = p[2]
    p[0] = Plus(operand)
|
||
|
|
||
|
|
||
|
def p_expression_minus(p):
    '''unary_expression : MINUS unary_expression'''
    # '-' prefix: wrap the operand in a Prohibit node.
    operand = p[2]
    p[0] = Prohibit(operand)
|
||
|
|
||
|
|
||
|
def p_expression_not(p):
    '''unary_expression : NOT unary_expression'''
    # NOT keyword: wrap the operand in a Not node.
    operand = p[2]
    p[0] = Not(operand)
|
||
|
|
||
|
|
||
|
def p_expression_unary(p):
    '''expression : unary_expression'''
    # A unary expression on its own is already an expression: forward the
    # node unchanged.
    node = p[1]
    p[0] = node
|
||
|
|
||
|
|
||
|
def p_grouping(p):
    'unary_expression : LPAREN expression RPAREN'
    # Parenthesised expression. p_field_search will transform the Group into
    # a field group if it turns out to follow a field name.
    inner = p[2]
    p[0] = Group(inner)
|
||
|
|
||
|
|
||
|
def p_range(p):
    '''unary_expression : LBRACKET phrase_or_term TO phrase_or_term RBRACKET'''
    # Square brackets make the matching bound inclusive; any other delimiter
    # accepted by the lexer (the curly braces) makes it exclusive.
    opening, low, high, closing = p[1], p[2], p[4], p[5]
    p[0] = Range(low, high, opening == "[", closing == "]")
|
||
|
|
||
|
|
||
|
def p_field_search(p):
    '''unary_expression : TERM COLUMN unary_expression'''
    # The field name was lexed as a Word, so its text lives in `.value`.
    field_name = p[1].value
    # A parenthesised group right after "field:" is converted with
    # group_to_fieldgroup before being attached to the field.
    if isinstance(p[3], Group):
        p[3] = group_to_fieldgroup(p[3])
    p[0] = SearchField(field_name, p[3])
|
||
|
|
||
|
|
||
|
def p_quoting(p):
    'unary_expression : PHRASE'
    # The lexer already built the Phrase node: forward it unchanged.
    phrase = p[1]
    p[0] = phrase
|
||
|
|
||
|
|
||
|
def p_proximity(p):
    '''unary_expression : PHRASE APPROX'''
    # A phrase followed by '~': build a Proximity node from the phrase and
    # the value carried by the APPROX token (None when no number was given).
    phrase, degree = p[1], p[2]
    p[0] = Proximity(phrase, degree)
|
||
|
|
||
|
|
||
|
def p_boosting(p):
    '''expression : expression BOOST'''
    # An expression followed by '^': build a Boost node from the expression
    # and the value carried by the BOOST token (None when no number was given).
    boosted, force = p[1], p[2]
    p[0] = Boost(boosted, force)
|
||
|
|
||
|
|
||
|
def p_terms(p):
    '''unary_expression : TERM'''
    # The lexer already built the Word node: forward it unchanged.
    word = p[1]
    p[0] = word
|
||
|
|
||
|
|
||
|
def p_fuzzy(p):
    '''unary_expression : TERM APPROX'''
    # A term followed by '~': build a Fuzzy node from the word and the value
    # carried by the APPROX token (None when no number was given).
    word, degree = p[1], p[2]
    p[0] = Fuzzy(word, degree)
|
||
|
|
||
|
|
||
|
def p_regex(p):
    '''unary_expression : REGEX'''
    # The lexer already built the Regex node: forward it unchanged.
    regex = p[1]
    p[0] = regex
|
||
|
|
||
|
|
||
|
# Special case: TO is only a keyword inside a range. Anywhere else the lexer
# still emits a TO token (its value is the raw text), so turn it back into an
# ordinary Word here.
def p_to_as_term(p):
    '''unary_expression : TO'''
    raw_text = p[1]
    p[0] = Word(raw_text)
|
||
|
|
||
|
|
||
|
def p_phrase_or_term(p):
    '''phrase_or_term : TERM
                      | PHRASE'''
    # Bound of a range (see p_range): either node is forwarded unchanged.
    bound = p[1]
    p[0] = bound
|
||
|
|
||
|
|
||
|
# Error rule for syntax errors
# TODO : should report better
def p_error(p):
    # `p` is the offending token, or None when no token is available; in that
    # case a generic hint is reported instead.
    location = p
    if location is None:
        location = "(probably at end of input, may be unmatch parenthesis or so)"
    message = "Syntax error in input at %r!" % location
    raise ParseError(error=message)
|
||
|
|
||
|
|
||
|
# Module-level parser, built by PLY from the p_* grammar rules and the
# `precedence` table defined in this module.
parser = yacc.yacc()
"""This is the parser generated by PLY
"""
|