| 1 | # -*- coding: utf-8 -*- |
|---|
| 2 | # |
|---|
| 3 | # Copyright (C) 2006 Alec Thomas <alec@swapoff.org> |
|---|
| 4 | # |
|---|
| 5 | # This software is licensed as described in the file COPYING, which |
|---|
| 6 | # you should have received as part of this distribution. |
|---|
| 7 | # |
|---|
| 8 | |
|---|
| 9 | """ |
|---|
| 10 | A generalised query parser and matcher. |
|---|
| 11 | """ |
|---|
| 12 | |
|---|
| 13 | import re |
|---|
| 14 | from trac.core import TracError |
|---|
| 15 | |
|---|
| 16 | |
|---|
| 17 | __all__ = ['Query', 'InvalidQuery'] |
|---|
| 18 | |
|---|
| 19 | |
|---|
class InvalidQuery(TracError):
    """Raised when a query phrase cannot be parsed or matched."""
|---|
| 22 | |
|---|
| 23 | |
|---|
class QueryNode(object):
    """A single node of a query parse tree.

    >>> QueryNode(QueryNode.TERM, 'one')
    ("one")
    >>> QueryNode(QueryNode.AND,
    ...           left=QueryNode(QueryNode.TERM, 'one'),
    ...           right=QueryNode(QueryNode.TERM, 'two'))
    (and
     ("one")
     ("two"))
    >>> QueryNode(QueryNode.NOT, left=QueryNode(QueryNode.TERM, 'one'))
    (not
     ("one")
     nil)
    """

    # TODO Separate lexer and parser identifiers
    NULL = 0      # empty query
    TERM = 1      # literal search term
    NOT = 2       # unary negation; operand in .left
    AND = 3       # conjunction
    OR = 4        # disjunction
    ATTR = 5      # attribute match; .left is the name, .right the expression
    BEGINSUB = 6  # lexer-only token: '('
    ENDSUB = 7    # lexer-only token: ')'

    # Trees can contain many nodes, so keep instances dict-free.
    __slots__ = ('type', 'value', 'left', 'right')

    # Display names for __repr__.  The lexer-only tokens (BEGINSUB/ENDSUB)
    # never survive into a finished tree and are deliberately absent.
    _type_map = {None: 'null', NULL: 'null', TERM: 'term', NOT: 'not',
                 AND: 'and', OR: 'or', ATTR: 'attr'}

    def __init__(self, type, value=None, left=None, right=None):
        self.type = type
        self.value = value
        self.left = left
        self.right = right

    def __repr__(self):
        # NOTE: Python 2-era convention — returns a UTF-8 encoded byte string.
        return self._render(self, 0).encode('utf-8')

    def _render(self, node, depth):
        """Pretty-print *node* recursively as an indented s-expression."""
        pad = u' ' * depth
        if node.type == QueryNode.TERM:
            out = u'%s("%s"' % (pad, node.value)
        elif node.type == QueryNode.ATTR:
            out = u'%s(attr' % (pad,)
        else:
            suffix = u' "%s"' % (node.value,) if node.value else u''
            out = u'%s(%s%s' % (pad, self._type_map[node.type], suffix)
        # Render both child slots whenever either is populated, using 'nil'
        # as a placeholder for an absent child.
        if node.left is not None or node.right is not None:
            rendered = []
            for child in (node.left, node.right):
                if child is not None:
                    rendered.append(self._render(child, depth + 1))
                else:
                    rendered.append(u'%snil' % (u' ' * (depth + 1)))
            out += u'\n' + u'\n'.join(rendered)
        return out + u')'
|---|
| 87 | |
|---|
| 88 | |
|---|
class Query(QueryNode):
    """Query parser.

    Converts a simple query language into a parse tree which Indexers can then
    convert into their own implementation-specific representation.

    The query language is in the following form:

    <term> <term>       Document must contain all of these terms.
    "some term"         Return documents matching this exact phrase.
    -<term>             Exclude documents containing this term.
    <term> or <term>    Return documents matching either term.

    <attr>:<term>       Customisable attribute matching.

    eg.

    >>> Query('lettuce tomato -cheese')
    (and
     ("lettuce")
     (and
      ("tomato")
      (not
       ("cheese")
       nil)))

    >>> Query('"mint slices" -timtams')
    (and
     ("mint slices")
     (not
      ("timtams")
      nil))

    >>> Query('"brie cheese" or "camembert cheese"')
    (or
     ("brie cheese")
     ("camembert cheese"))

    >>> Query('type:(soft or hard) (brie or camembert or cheddar)')
    (and
     (attr
      ("type")
      (or
       ("soft")
       ("hard")))
     (or
      ("brie")
      (or
       ("camembert")
       ("cheddar"))))
    """

    # Lexer pattern: each named group corresponds to one token type (see
    # _group_map).  Quoted phrases allow backslash escapes; whitespace in
    # the pattern itself is insignificant under re.VERBOSE.
    _tokenise_re = re.compile(r"""
        (?P<not>-)|
        (?P<or>or)|
        \"(?P<dquote>(?:\\.|[^\"])*)\"|
        '(?P<squote>(?:\\.|[^'])*)'|
        (?P<startsub>\()|
        (?P<endsub>\))|
        (?P<attr>:)|
        (?P<term>[^:()"'\s]+)""", re.UNICODE | re.IGNORECASE | re.VERBOSE)

    # Maps regex group names from _tokenise_re to QueryNode token constants.
    _group_map = {'dquote': QueryNode.TERM, 'squote': QueryNode.TERM,
                  'term': QueryNode.TERM, 'not': QueryNode.NOT,
                  'or': QueryNode.OR, 'attr': QueryNode.ATTR,
                  'startsub': QueryNode.BEGINSUB, 'endsub': QueryNode.ENDSUB}

    def __init__(self, phrase, attribute_handlers=None):
        """Construct a new Query.

        Attribute handlers are callables with the signature
        (attribute_name, node, context) where node is the QueryNode
        representing the RHS of the attribute expression and context is a
        custom parameter passed to Query.__call__().

        :param phrase: Query phrase.
        :param attribute_handlers: A dictionary of attribute handlers.
        :raises InvalidQuery: If the phrase cannot be parsed.
        """
        QueryNode.__init__(self, None)
        tokens = self._tokenise(phrase)
        root = self.parse(tokens)
        self.phrase = phrase
        self._compiled = None
        self.attribute_handlers = attribute_handlers or {}
        # Unknown attributes fall through to the '*' handler, which by
        # default rejects them with InvalidQuery.
        self.attribute_handlers.setdefault('*', self._invalid_handler)
        if root:
            # Make ourselves into the root node
            for k in self.__slots__:
                setattr(self, k, getattr(root, k))

    def parse(self, tokens):
        """Parse a (possibly binary) expression from the token list.

        Consumes tokens destructively and returns the root QueryNode, or
        None for an empty token list.  Adjacent terms are implicitly
        AND-ed; 'or' and ':' (attribute) are handled explicitly.

        :raises InvalidQuery: On malformed input (e.g. a non-term LHS for
            an attribute expression).
        """
        left = self.parse_unary(tokens)
        while tokens:
            if tokens[0][0] == QueryNode.ENDSUB:
                # End of a parenthesised sub-expression; caller consumes ')'.
                return left
            if tokens[0][0] == QueryNode.OR:
                tokens.pop(0)
                return QueryNode(QueryNode.OR, left=left,
                                 right=self.parse(tokens))
            elif tokens[0][0] == QueryNode.ATTR:
                tokens.pop(0)
                if left.type is not QueryNode.TERM:
                    raise InvalidQuery('Attribute must be a word')
                left = QueryNode(QueryNode.ATTR, left=left,
                                 right=self.parse_unary(tokens))
            else:
                # Implicit conjunction between adjacent expressions.
                return QueryNode(QueryNode.AND, left=left,
                                 right=self.parse(tokens))
        return left

    def parse_unary(self, tokens):
        """Parse a unary operator. Currently only NOT.

        >>> q = Query('')
        >>> q.parse_unary(q._tokenise('-foo'))
        (not
         ("foo")
         nil)
        """
        if not tokens:
            return None
        if tokens[0][0] == QueryNode.BEGINSUB:
            tokens.pop(0)
            # Guard against a dangling '(' at the end of the phrase; without
            # this check the tokens[0] peek below raises IndexError instead
            # of the intended InvalidQuery.
            if not tokens:
                raise InvalidQuery('Expected ) at end of sub-expression')
            if tokens[0][0] == QueryNode.ENDSUB:
                # Empty sub-expression '()' parses to nothing.
                return None
            node = self.parse(tokens)
            if not tokens or tokens[0][0] != QueryNode.ENDSUB:
                raise InvalidQuery('Expected ) at end of sub-expression')
            tokens.pop(0)
            return node
        if tokens[0][0] == QueryNode.NOT:
            tokens.pop(0)
            return QueryNode(QueryNode.NOT, left=self.parse_terminal(tokens))
        return self.parse_terminal(tokens)

    def parse_terminal(self, tokens):
        """Parse a terminal token.

        A leading OR token is accepted as a plain term so that phrases like
        'or' alone are still searchable.

        >>> q = Query('')
        >>> q.parse_terminal(q._tokenise('foo'))
        ("foo")
        """

        if not tokens:
            raise InvalidQuery('Unexpected end of string')
        if tokens[0][0] in (QueryNode.TERM, QueryNode.OR):
            token = tokens.pop(0)
            return QueryNode(QueryNode.TERM, value=token[1])
        raise InvalidQuery('Expected terminal, got "%s"' % tokens[0][1])

    def terms(self, exclude_not=True):
        """A generator returning the terms contained in the Query.

        Attribute sub-trees are always skipped; NOT sub-trees are skipped
        unless exclude_not is False.

        >>> q = Query('foo -bar or baz')
        >>> list(q.terms())
        ['foo', 'baz']
        >>> list(q.terms(exclude_not=False))
        ['foo', 'bar', 'baz']
        """
        def _convert(node):
            if not node:
                return
            if node.type == node.TERM:
                yield node.value
            elif node.type == node.ATTR:
                return
            elif node.type == node.NOT and exclude_not:
                return
            else:
                for child in _convert(node.left):
                    yield child
                for child in _convert(node.right):
                    yield child

        return _convert(self)

    def __call__(self, terms, context=None):
        """Match the query against a sequence of terms."""
        return self.match(self, terms, context)

    def match(self, node, terms, context=None):
        """Match a node against a set of terms.

        :param node: Root of the (sub-)tree to evaluate.
        :param terms: Container supporting `in` for term membership.
        :param context: Opaque value forwarded to attribute handlers.
        :returns: True if the tree matches `terms`.
        """
        def _match(node):
            # An absent node or an empty (NULL/untyped) query matches
            # everything.
            if not node or node.type == node.NULL or node.type is None:
                return True
            elif node.type == node.TERM:
                return node.value in terms
            elif node.type == node.AND:
                return _match(node.left) and _match(node.right)
            elif node.type == node.OR:
                return _match(node.left) or _match(node.right)
            elif node.type == node.NOT:
                return not _match(node.left)
            elif node.type == node.ATTR:
                # Dispatch to the handler registered for this attribute
                # name, falling back to the catch-all '*' handler.
                return self.attribute_handlers.get(
                    node.left.value,
                    self.attribute_handlers['*']
                )(node.left.value, node.right, context)
            else:
                raise NotImplementedError(node.type)
        return _match(node)


    def _compile_call(self, text, attribute_handlers=None):
        """(Unfinished) compile the query into a Python lambda for matching.

        Always raises NotImplementedError.  The sketch below targets the
        Python 2-only `compiler` package and is kept for reference; the
        raise is placed first so callers get the intended exception rather
        than an ImportError from the dead code.
        """
        raise NotImplementedError('Incomplete')

        import compiler
        import types
        from compiler import ast, misc, pycodegen

        # TODO Make this work?
        def _generate(node):
            if node.type == node.TERM:
                return ast.Compare(ast.Const(node.value),
                                   [('in', ast.Name('text'))])
            elif node.type == node.AND:
                return ast.And([_generate(node.left), _generate(node.right)])
            elif node.type == node.OR:
                return ast.Or([_generate(node.left), _generate(node.right)])
            elif node.type == node.NOT:
                return ast.Not(_generate(node.left))
            elif node.type == node.ATTR:
                raise NotImplementedError

        qast = ast.Expression(ast.Lambda(['self', 'text',
                                          'attribute_handlers'],
                                         [ast.Name('None')],
                                         0,
                                         _generate(self)))
        misc.set_filename('<%s compiled query>' % self.__class__.__name__,
                          qast)
        gen = pycodegen.ExpressionCodeGenerator(qast)
        self.__call__ = types.MethodType(eval(gen.getCode()), self, Query)
        return self.__call__(text)

    def as_string(self, and_=' AND ', or_=' OR ', not_='NOT '):
        """Convert Query to a boolean expression. Useful for indexers with
        "typical" boolean query syntaxes.

        eg. "term AND term OR term AND NOT term"

        The expanded operators can be customised for syntactical variations.

        >>> Query('foo bar').as_string()
        'foo AND bar'
        >>> Query('foo bar or baz').as_string()
        'foo AND bar OR baz'
        >>> Query('foo -bar or baz').as_string()
        'foo AND NOT bar OR baz'
        """
        def _convert(node):
            if not node or node.type == node.NULL:
                return ''
            if node.type == node.AND:
                return '%s%s%s' % (_convert(node.left), and_,
                                   _convert(node.right))
            elif node.type == node.OR:
                return '%s%s%s' % (_convert(node.left), or_,
                                   _convert(node.right))
            elif node.type == node.NOT:
                return '%s%s' % (not_, _convert(node.left))
            elif node.type == node.TERM:
                return node.value
            elif node.type == node.ATTR:
                return '%s:%s' % (_convert(node.left), _convert(node.right))
            else:
                raise NotImplementedError
        return _convert(self)

    def reduce(self, reduce):
        """Pass each TERM node through `Reducer`.

        Mutates the tree in place, replacing each TERM value with
        reduce(value, unique=False, split=False).
        """
        def _reduce(node):
            if not node:
                return
            if node.type == node.TERM:
                node.value = reduce(node.value, unique=False, split=False)
            _reduce(node.left)
            _reduce(node.right)
        _reduce(self)

    # Internal methods
    def _tokenise(self, phrase):
        """Tokenise a phrase string into (token_type, text) pairs.

        >>> q = Query('')
        >>> q._tokenise('one')
        [(1, 'one')]
        >>> q._tokenise('one two')
        [(1, 'one'), (1, 'two')]
        >>> q._tokenise('one or two')
        [(1, 'one'), (4, 'or'), (1, 'two')]
        >>> q._tokenise('"one two"')
        [(1, 'one two')]
        >>> q._tokenise("'one two'")
        [(1, 'one two')]
        >>> q._tokenise('-one')
        [(2, '-'), (1, 'one')]
        """
        tokens = [(self._group_map[token.lastgroup], token.group(token.lastindex))
                  for token in self._tokenise_re.finditer(phrase)]
        return tokens

    def _invalid_handler(self, name, node, context):
        # Default '*' attribute handler: reject any attribute that the
        # caller did not explicitly register.
        raise InvalidQuery('Invalid attribute "%s"' % name)
|---|
| 395 | |
|---|
if __name__ == '__main__':
    # Run this module's doctest examples when executed directly.
    import doctest
    doctest.testmod()
|---|