# -*- coding: utf-8 -*-
#
# Copyright (C) 2006 Alec Thomas <alec@swapoff.org>
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution.
#

"""
A generalised query parser and matcher.
"""

import re

from trac.core import TracError

from tractags.api import _

__all__ = ['Query', 'InvalidQuery']


class InvalidQuery(TracError):
    """Raised when a query is invalid."""


class QueryNode(object):
    """A query parse node.

    >>> QueryNode(QueryNode.TERM, 'one')
    ("one")
    >>> QueryNode(QueryNode.AND,
    ...           left=QueryNode(QueryNode.TERM, 'one'),
    ...           right=QueryNode(QueryNode.TERM, 'two'))
    (and
     ("one")
     ("two"))
    >>> QueryNode(QueryNode.NOT, left=QueryNode(QueryNode.TERM, 'one'))
    (not
     ("one")
     nil)
    """

    # TODO Separate lexer and parser identifiers
    NULL = 0
    TERM = 1
    NOT = 2
    AND = 3
    OR = 4
    ATTR = 5
    BEGINSUB = 6
    ENDSUB = 7
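    # BEGINSUB and ENDSUB only occur in the token stream produced by
    # Query._tokenise(); the parser consumes them directly, so they never
    # appear as a node type in the finished tree (hence the TODO above).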

    __slots__ = ('type', 'value', 'left', 'right')

    _type_map = {None: 'null', NULL: 'null', TERM: 'term', NOT: 'not',
                 AND: 'and', OR: 'or', ATTR: 'attr'}

    def __init__(self, type, value=None, left=None, right=None):
        self.type = type
        self.value = value
        self.left = left
        self.right = right

    def __repr__(self):
        def show(node, depth=0):
            if node.type == QueryNode.TERM:
                text = u'%s("%s"' % (u' ' * depth, node.value)
            elif node.type == QueryNode.ATTR:
                text = u'%s(attr' % (u' ' * depth,)
            else:
                text = u'%s(%s%s' % (u' ' * depth, self._type_map[node.type],
                                     node.value and u' "%s"' % (node.value,)
                                     or u'')
            if node.left or node.right:
                text += u'\n'
                if node.left:
                    text += show(node.left, depth + 1)
                else:
                    text += u'%snil' % (u' ' * (depth + 1))
                text += u'\n'
                if node.right:
                    text += show(node.right, depth + 1)
                else:
                    text += u'%snil' % (u' ' * (depth + 1))
            text += u')'
            return text
        return show(self).encode('utf-8')


class Query(QueryNode):
    """Query parser.

    Converts a simple query language into a parse tree which Indexers can
    then convert into their own implementation-specific representation.

    The query language is in the following form:

      <term> <term>       Document must contain all of these terms.
      "some term"         Return documents matching this exact phrase.
      -<term>             Exclude documents containing this term.
      <term> or <term>    Return documents matching either term.

      <attr>:<term>       Customisable attribute matching.

    eg.

    >>> Query('lettuce tomato -cheese')
    (and
     ("lettuce")
     (and
      ("tomato")
      (not
       ("cheese")
       nil)))

    >>> Query('"mint slices" -timtams')
    (and
     ("mint slices")
     (not
      ("timtams")
      nil))

    >>> Query('"brie cheese" or "camembert cheese"')
    (or
     ("brie cheese")
     ("camembert cheese"))

    >>> Query('type:(soft or hard) (brie or camembert or cheddar)')
    (and
     (attr
      ("type")
      (or
       ("soft")
       ("hard")))
     (or
      ("brie")
      (or
       ("camembert")
       ("cheddar"))))
    """

    _tokenise_re = re.compile(r"""
        (?P<not>-)|
        (?P<or>or)\s+|
        (?P<dquote>"(?:\\.|[^"])*")|
        (?P<squote>'(?:\\.|[^'])*')|
        (?P<startsub>\()|
        (?P<endsub>\))|
        (?P<attr>:)|
        (?P<term>[^:()\s]+)""", re.UNICODE | re.IGNORECASE | re.VERBOSE)
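    # Note: the 'or' alternative only matches when followed by whitespace, so
    # a trailing 'or' is tokenised as a plain term; parse_terminal() likewise
    # accepts an OR token in terminal position and treats it as the term 'or'.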

    _group_map = {'dquote': QueryNode.TERM, 'squote': QueryNode.TERM,
                  'term': QueryNode.TERM, 'not': QueryNode.NOT,
                  'or': QueryNode.OR, 'attr': QueryNode.ATTR,
                  'startsub': QueryNode.BEGINSUB, 'endsub': QueryNode.ENDSUB}

    def __init__(self, phrase, attribute_handlers=None):
        """Construct a new Query.

        Attribute handlers are callables with the signature
        (attribute_name, node, context), where node is the QueryNode
        representing the RHS of the attribute expression and context is a
        custom parameter passed to Query.__call__().

        :param phrase: Query phrase.
        :param attribute_handlers: A dictionary of attribute handlers.
        """
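        # A minimal sketch of an attribute handler (hypothetical names): it
        # compares the right-hand side term against the context value passed
        # to Query.__call__(), e.g.
        #
        #   def type_handler(name, node, context):
        #       return node.value == context
        #
        #   Query('type:wiki', attribute_handlers={'type': type_handler})(
        #       [], context='wiki')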
        QueryNode.__init__(self, None)
        tokens = self._tokenise(phrase)
        root = self.parse(tokens)
        self.phrase = phrase
        self._compiled = None
        self.attribute_handlers = attribute_handlers or {}
        self.attribute_handlers.setdefault('*', self._invalid_handler)
        if root:
            # Make ourselves into the root node
            for k in self.__slots__:
                setattr(self, k, getattr(root, k))

    def parse(self, tokens):
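        # Recursive descent over the flat token list: parse one unary
        # expression, then combine it with whatever follows.  OR and the
        # implicit AND both recurse on the remainder, so the tree leans to
        # the right, e.g. 'a b c' parses as (and a (and b c)).  An ATTR
        # token turns the preceding TERM into an <attr>:<expr> node whose
        # RHS is the next unary expression.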
        left = self.parse_unary(tokens)
        while tokens:
            if tokens[0][0] == QueryNode.ENDSUB:
                return left
            if tokens[0][0] == QueryNode.OR:
                tokens.pop(0)
                return QueryNode(QueryNode.OR, left=left,
                                 right=self.parse(tokens))
            elif tokens[0][0] == QueryNode.ATTR:
                tokens.pop(0)
                if left.type is not QueryNode.TERM:
                    raise InvalidQuery(_("Attribute must be a word"))
                left = QueryNode(QueryNode.ATTR, left=left,
                                 right=self.parse_unary(tokens))
            else:
                return QueryNode(QueryNode.AND, left=left,
                                 right=self.parse(tokens))
        return left

    def parse_unary(self, tokens):
        """Parse a unary expression: negation (NOT) or a parenthesised
        sub-expression.

        >>> q = Query('')
        >>> q.parse_unary(q._tokenise('-foo'))
        (not
         ("foo")
         nil)
        """
        if not tokens:
            return None
        if tokens[0][0] == QueryNode.BEGINSUB:
            tokens.pop(0)
            if tokens[0][0] == QueryNode.ENDSUB:
                return None
            node = self.parse(tokens)
            if not tokens or tokens[0][0] != QueryNode.ENDSUB:
                raise InvalidQuery(_("Expected ) at end of sub-expression"))
            tokens.pop(0)
            return node
        if tokens[0][0] == QueryNode.NOT:
            tokens.pop(0)
            return QueryNode(QueryNode.NOT, left=self.parse_terminal(tokens))
        return self.parse_terminal(tokens)

    def parse_terminal(self, tokens):
        """Parse a terminal token.

        >>> q = Query('')
        >>> q.parse_terminal(q._tokenise('foo'))
        ("foo")
        """
        if not tokens:
            raise InvalidQuery(_("Unexpected end of string"))
        if tokens[0][0] in (QueryNode.TERM, QueryNode.OR):
            token = tokens.pop(0)[1]
            if token[0] in ('"', "'"):
                token = re.sub(r'\\(.)', r'\1', token[1:-1])
            return QueryNode(QueryNode.TERM, value=token)
        raise InvalidQuery(_("Expected terminal, got '%s'") % tokens[0][1])

    def terms(self, exclude_not=True):
        """A generator returning the terms contained in the Query.

        >>> q = Query('foo -bar or baz')
        >>> list(q.terms())
        ['foo', 'baz']
        >>> list(q.terms(exclude_not=False))
        ['foo', 'bar', 'baz']
        """
        def _convert(node):
            if not node:
                return
            if node.type == node.TERM:
                yield node.value
            elif node.type == node.ATTR:
                return
            elif node.type == node.NOT and exclude_not:
                return
            else:
                for child in _convert(node.left):
                    yield child
                for child in _convert(node.right):
                    yield child

        return _convert(self)

    def __call__(self, terms, context=None):
        """Match the query against a sequence of terms."""
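        # e.g. Query('lettuce -cheese')(['lettuce', 'tomato']) evaluates True,
        # while Query('lettuce -cheese')(['lettuce', 'cheese']) evaluates
        # False; see match() below for the per-node semantics.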
        return self.match(self, terms, context)

    def match(self, node, terms, context=None):
        """Match a node against a set of terms."""
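        # TERM nodes test membership in `terms`; ATTR nodes dispatch to the
        # handler registered for the attribute name, falling back to the '*'
        # handler (which raises InvalidQuery unless it was overridden).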
        def _match(node):
            if not node or node.type == node.NULL:
                return True
            elif node.type == node.TERM:
                return node.value in terms
            elif node.type == node.AND:
                return _match(node.left) and _match(node.right)
            elif node.type == node.OR:
                return _match(node.left) or _match(node.right)
            elif node.type == node.NOT:
                return not _match(node.left)
            elif node.type == node.ATTR:
                return self.attribute_handlers.get(
                    node.left.value,
                    self.attribute_handlers['*']
                )(node.left.value, node.right, context)
            elif node.type is None:
                return True
            else:
                raise NotImplementedError(node.type)
        return _match(node)

    def _compile_call(self, text, attribute_handlers=None):
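        # Unfinished experiment: relies on the Python 2-only `compiler`
        # package (removed in Python 3) to build and compile an AST for the
        # query, which would replace __call__ with the compiled matcher.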
        import compiler
        import types
        from compiler import ast, misc, pycodegen

        raise NotImplementedError('Incomplete')

        # TODO Make this work?
        def _generate(node):
            if node.type == node.TERM:
                return ast.Compare(ast.Const(node.value),
                                   [('in', ast.Name('text'))])
            elif node.type == node.AND:
                return ast.And([_generate(node.left), _generate(node.right)])
            elif node.type == node.OR:
                return ast.Or([_generate(node.left), _generate(node.right)])
            elif node.type == node.NOT:
                return ast.Not(_generate(node.left))
            elif node.type == node.ATTR:
                raise NotImplementedError

        qast = ast.Expression(ast.Lambda(['self', 'text',
                                          'attribute_handlers'],
                                         [ast.Name('None')],
                                         0,
                                         _generate(self)))
        misc.set_filename('<%s compiled query>' % self.__class__.__name__,
                          qast)
        gen = pycodegen.ExpressionCodeGenerator(qast)
        self.__call__ = types.MethodType(eval(gen.getCode()), self, Query)
        return self.__call__(text)

    def as_string(self, and_=' AND ', or_=' OR ', not_='NOT '):
        """Convert Query to a boolean expression. Useful for indexers with
        "typical" boolean query syntaxes.

        eg. "term AND term OR term AND NOT term"

        The expanded operators can be customised for syntactical variations.

        >>> Query('foo bar').as_string()
        'foo AND bar'
        >>> Query('foo bar or baz').as_string()
        'foo AND bar OR baz'
        >>> Query('foo -bar or baz').as_string()
        'foo AND NOT bar OR baz'
        """
        def _convert(node):
            if not node or not node.type or node.type == node.NULL:
                return ''
            if node.type == node.AND:
                return '%s%s%s' % (_convert(node.left), and_,
                                   _convert(node.right))
            elif node.type == node.OR:
                return '%s%s%s' % (_convert(node.left), or_,
                                   _convert(node.right))
            elif node.type == node.NOT:
                return '%s%s' % (not_, _convert(node.left))
            elif node.type == node.TERM:
                return node.value
            elif node.type == node.ATTR:
                return '%s:%s' % (_convert(node.left), _convert(node.right))
            else:
                raise NotImplementedError
        return _convert(self)

    def as_sql(self, col_name):
        """Convert Query to a SQL expression.

        The table column name must be given as an argument.

        >>> Query('foo bar').as_sql('c')
        "c='foo' AND c='bar'"
        >>> Query('foo bar or baz').as_sql('c')
        "c='foo' AND c='bar' OR c='baz'"
        >>> Query('foo -bar or baz').as_sql('c')
        "c='foo' AND NOT c='bar' OR c='baz'"
        """
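        # Term values are interpolated directly into the returned SQL string,
        # so callers are responsible for escaping or validating untrusted
        # query input before using the result.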
        def _convert(node):
            if not node or not node.type or node.type == node.NULL:
                return ''
            if node.type == node.AND:
                return '%s AND %s' % (_convert(node.left),
                                      _convert(node.right))
            elif node.type == node.OR:
                return '%s OR %s' % (_convert(node.left),
                                     _convert(node.right))
            elif node.type == node.NOT:
                return 'NOT %s' % _convert(node.left)
            elif node.type == node.TERM:
                return "%s='%s'" % (col_name, node.value)
            elif node.type == node.ATTR:
                raise NotImplementedError
            else:
                raise NotImplementedError
        return _convert(self)

    def reduce(self, reduce):
        """Pass each TERM node's value through the `reduce` callable."""
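        # The callable is invoked as reduce(value, unique=False, split=False)
        # and its return value replaces the node's original term value.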
        def _reduce(node):
            if not node:
                return
            if node.type == node.TERM:
                node.value = reduce(node.value, unique=False, split=False)
            _reduce(node.left)
            _reduce(node.right)
        _reduce(self)

    # Internal methods
    def _tokenise(self, phrase):
        """Tokenise a phrase string.

        >>> q = Query('')
        >>> q._tokenise('one')
        [(1, 'one')]
        >>> q._tokenise('one two')
        [(1, 'one'), (1, 'two')]
        >>> q._tokenise('one or two')
        [(1, 'one'), (4, 'or'), (1, 'two')]
        >>> q._tokenise('"one two"')
        [(1, '"one two"')]
        >>> q._tokenise("'one two'")
        [(1, "'one two'")]
        >>> q._tokenise('-one')
        [(2, '-'), (1, 'one')]
        """
        tokens = [(self._group_map[token.lastgroup],
                   token.group(token.lastindex))
                  for token in self._tokenise_re.finditer(phrase)]
        return tokens

    def _invalid_handler(self, name, node, context):
        raise InvalidQuery(_("Invalid attribute '%s'") % name)


if __name__ == '__main__':
    import doctest
    doctest.testmod()