1 """CSS Selectors based on XPath.
2
3 This module supports selecting XML/HTML tags based on CSS selectors.
4 See the `CSSSelector` class for details.
5 """
6
7 import re
8 from lxml import etree
9
10 __all__ = ['SelectorSyntaxError', 'ExpressionError',
11 'CSSSelector']
12
13 try:
14 _basestring = basestring
15 except NameError:
16 _basestring = str
17
20
23
25 """A CSS selector.
26
27 Usage::
28
29 >>> from lxml import etree, cssselect
30 >>> select = cssselect.CSSSelector("a tag > child")
31
32 >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
33 >>> [ el.tag for el in select(root) ]
34 ['child']
35
36 To use CSS namespaces, you need to pass a prefix-to-namespace
37 mapping as ``namespaces`` keyword argument::
38
39 >>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
40 >>> select_ns = cssselect.CSSSelector('root > rdf|Description',
41 ... namespaces={'rdf': rdfns})
42
43 >>> rdf = etree.XML((
44 ... '<root xmlns:rdf="%s">'
45 ... '<rdf:Description>blah</rdf:Description>'
46 ... '</root>') % rdfns)
47 >>> [(el.tag, el.text) for el in select_ns(rdf)]
48 [('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')]
49 """
50 - def __init__(self, css, namespaces=None):
54
56 return '<%s %s for %r>' % (
57 self.__class__.__name__,
58 hex(abs(id(self)))[2:],
59 self.css)
60
61
62
63
64 try:
65 _unicode = unicode
66 _unichr = unichr
67 except NameError:
68
69 _unicode = str
70 _unichr = chr
71
74 obj = _unicode.__new__(cls, contents)
75 obj.pos = pos
76 return obj
77
79 return '%s(%s, %r)' % (
80 self.__class__.__name__,
81 _unicode.__repr__(self),
82 self.pos)
83
86
89
92
93
94
95
96
97
98
99
101 """
102 Represents selector.class_name
103 """
104
def __init__(self, selector, class_name):
    """Remember the qualified selector and the class name it must carry.

    ``selector`` is the node this ``.class_name`` test is attached to;
    ``class_name`` is the name that follows the dot.
    """
    self.selector, self.class_name = selector, class_name
108
110 return '%s[%r.%s]' % (
111 self.__class__.__name__,
112 self.selector,
113 self.class_name)
114
116 sel_xpath = self.selector.xpath()
117 sel_xpath.add_condition(
118 "contains(concat(' ', normalize-space(@class), ' '), %s)" % xpath_literal(' '+self.class_name+' '))
119 return sel_xpath
120
122 """
123 Represents selector:name(expr)
124 """
125
126 unsupported = [
127 'target', 'lang', 'enabled', 'disabled',]
128
def __init__(self, selector, type, name, expr):
    """Store the parts of a ``selector:name(expr)`` node.

    ``selector`` is the node being qualified, ``type`` the colon token
    that introduced the function, ``name`` the function name and ``expr``
    its parsed argument.
    """
    self.selector, self.type = selector, type
    self.name, self.expr = name, expr
134
136 return '%s[%r%s%s(%r)]' % (
137 self.__class__.__name__,
138 self.selector,
139 self.type, self.name, self.expr)
140
152
155 a, b = parse_series(expr)
156 if not a and not b and not last:
157
158 xpath.add_condition('false() and position() = 0')
159 return xpath
160 if add_name_test:
161 xpath.add_name_test()
162 xpath.add_star_prefix()
163 if a == 0:
164 if last:
165 b = 'last() - %s' % b
166 xpath.add_condition('position() = %s' % b)
167 return xpath
168 if last:
169
170 a = -a
171 b = -b
172 if b > 0:
173 b_neg = str(-b)
174 else:
175 b_neg = '+%s' % (-b)
176 if a != 1:
177 expr = ['(position() %s) mod %s = 0' % (b_neg, a)]
178 else:
179 expr = []
180 if b >= 0:
181 expr.append('position() >= %s' % b)
182 elif b < 0 and last:
183 expr.append('position() < (last() %s)' % b)
184 expr = ' and '.join(expr)
185 if expr:
186 xpath.add_condition(expr)
187 return xpath
188
189
190
191
192
193
194
195
198
204
207
216
224
227
228 ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/')
229 ns.prefix = 'css'
230 ns['lower-case'] = _make_lower_case
231
233 """
234 Represents selector:ident
235 """
236
237 unsupported = ['indeterminate', 'first-line', 'first-letter',
238 'selection', 'before', 'after', 'link', 'visited',
239 'active', 'focus', 'hover']
240
def __init__(self, element, type, ident):
    """Store the parts of a ``selector:ident`` node.

    ``element`` is the node being qualified, ``type`` the introducing
    token (must be ``':'`` or ``'::'``) and ``ident`` the pseudo name.
    """
    self.element = element
    # Only the single- and double-colon forms are valid introducers.
    assert type == ':' or type == '::'
    self.type, self.ident = type, ident
246
248 return '%s[%r%s%s]' % (
249 self.__class__.__name__,
250 self.element,
251 self.type, self.ident)
252
265
267
268 xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')")
269 return xpath
270
272
273 raise NotImplementedError
274
280
286
294
302
308
310 if xpath.element == '*':
311 raise NotImplementedError(
312 "*:only-of-type is not implemented")
313 xpath.add_condition('last() = 1')
314 return xpath
315
319
321 """
322 Represents selector[namespace|attrib operator value]
323 """
324
325 - def __init__(self, selector, namespace, attrib, operator, value):
331
333 if self.operator == 'exists':
334 return '%s[%r[%s]]' % (
335 self.__class__.__name__,
336 self.selector,
337 self._format_attrib())
338 else:
339 return '%s[%r[%s %s %r]]' % (
340 self.__class__.__name__,
341 self.selector,
342 self._format_attrib(),
343 self.operator,
344 self.value)
345
351
358
360 path = self.selector.xpath()
361 attrib = self._xpath_attrib()
362 value = self.value
363 if self.operator == 'exists':
364 assert not value
365 path.add_condition(attrib)
366 elif self.operator == '=':
367 path.add_condition('%s = %s' % (attrib,
368 xpath_literal(value)))
369 elif self.operator == '!=':
370
371 if value:
372 path.add_condition('not(%s) or %s != %s'
373 % (attrib, attrib, xpath_literal(value)))
374 else:
375 path.add_condition('%s != %s'
376 % (attrib, xpath_literal(value)))
377
378 elif self.operator == '~=':
379 path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_literal(' '+value+' ')))
380 elif self.operator == '|=':
381
382 path.add_condition('%s = %s or starts-with(%s, %s)' % (
383 attrib, xpath_literal(value),
384 attrib, xpath_literal(value + '-')))
385 elif self.operator == '^=':
386 path.add_condition('starts-with(%s, %s)' % (
387 attrib, xpath_literal(value)))
388 elif self.operator == '$=':
389
390 path.add_condition('substring(%s, string-length(%s)-%s) = %s'
391 % (attrib, attrib, len(value)-1, xpath_literal(value)))
392 elif self.operator == '*=':
393
394 path.add_condition('contains(%s, %s)' % (
395 attrib, xpath_literal(value)))
396 else:
397 assert 0, ("Unknown operator: %r" % self.operator)
398 return path
399
401 """
402 Represents namespace|element
403 """
404
405 - def __init__(self, namespace, element):
408
410 return '%s[%s]' % (
411 self.__class__.__name__,
412 self._format_element())
413
419
421 if self.namespace == '*':
422 el = self.element.lower()
423 else:
424
425 el = '%s:%s' % (self.namespace, self.element)
426 return XPathExpr(element=el)
427
429 """
430 Represents selector#id
431 """
432
434 self.selector = selector
435 self.id = id
436
438 return '%s[%r#%s]' % (
439 self.__class__.__name__,
440 self.selector, self.id)
441
446
448
452 return '%s(%r)' % (
453 self.__class__.__name__,
454 self.items)
455
459
461
462 _method_mapping = {
463 ' ': 'descendant',
464 '>': 'child',
465 '+': 'direct_adjacent',
466 '~': 'indirect_adjacent',
467 }
468
def __init__(self, selector, combinator, subselector):
    """Store the two selectors and the combinator that joins them.

    ``selector`` is the left-hand side and must not be None;
    ``combinator`` is the joining token and ``subselector`` the
    right-hand side.
    """
    assert selector is not None
    self.selector, self.combinator, self.subselector = (
        selector, combinator, subselector)
474
476 if self.combinator == ' ':
477 comb = '<followed>'
478 else:
479 comb = self.combinator
480 return '%s[%r %s %r]' % (
481 self.__class__.__name__,
482 self.selector,
483 comb,
484 self.subselector)
485
494
499
504
511
516
517
518
519
520 _el_re = re.compile(r'^\w+\s*$', re.UNICODE)
521 _id_re = re.compile(r'^(\w*)#(\w+)\s*$', re.UNICODE)
522 _class_re = re.compile(r'^(\w*)\.(\w+)\s*$', re.UNICODE)
523
525 if isinstance(css_expr, _basestring):
526 match = _el_re.search(css_expr)
527 if match is not None:
528 return '%s%s' % (prefix, match.group(0).strip())
529 match = _id_re.search(css_expr)
530 if match is not None:
531 return "%s%s[@id = '%s']" % (
532 prefix, match.group(1) or '*', match.group(2))
533 match = _class_re.search(css_expr)
534 if match is not None:
535 return "%s%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (
536 prefix, match.group(1) or '*', match.group(2))
537 css_expr = parse(css_expr)
538 expr = css_expr.xpath()
539 assert expr is not None, (
540 "Got None for xpath expression from %s" % repr(css_expr))
541 if prefix:
542 expr.add_prefix(prefix)
543 return _unicode(expr)
544
546
def __init__(self, prefix=None, path=None, element='*', condition=None,
             star_prefix=False):
    """Store the pieces from which the XPath string is assembled.

    ``prefix`` and ``path`` precede the ``element`` name test,
    ``condition`` is the predicate text (if any) and ``star_prefix``
    records whether a ``*/`` step has been added to the path.
    """
    self.prefix, self.path = prefix, path
    self.element, self.condition = element, condition
    self.star_prefix = star_prefix
554
556 path = ''
557 if self.prefix is not None:
558 path += _unicode(self.prefix)
559 if self.path is not None:
560 path += _unicode(self.path)
561 path += _unicode(self.element)
562 if self.condition:
563 path += '[%s]' % self.condition
564 return path
565
567 return '%s[%s]' % (
568 self.__class__.__name__, self)
569
571 if self.condition:
572 self.condition = '%s and (%s)' % (self.condition, condition)
573 else:
574 self.condition = condition
575
577 if self.path is None:
578 self.path = self.element
579 else:
580 self.path += self.element
581 self.element = part
582
588
590 if self.element == '*':
591
592 return
593 self.add_condition("name() = %s" % xpath_literal(self.element))
594 self.element = '*'
595
597 """
598 Adds a /* prefix if there is no prefix. This is when you need
599 to keep context's constrained to a single parent.
600 """
601 if self.path:
602 self.path += '*/'
603 else:
604 self.path = '*/'
605 self.star_prefix = True
606
def join(self, combiner, other):
    """Append ``other`` to this expression, separated by ``combiner``.

    The current expression, rendered as text and followed by
    ``combiner``, becomes the new prefix; path, element and condition
    are taken over from ``other``.  This object is modified in place.
    """
    joined_prefix = _unicode(self) + combiner
    other_path = (other.prefix or '') + (other.path or '')
    # A lone '*/' star prefix on ``other`` is redundant once it is
    # joined behind this expression, so it is dropped.
    if other.star_prefix and other_path == '*/':
        other_path = ''
    self.prefix = joined_prefix
    self.path = other_path
    self.element = other.element
    self.condition = other.condition
619
621 """
622 Represents |'d expressions. Note that unfortunately it isn't
623 the union, it's the sum, so duplicate elements will appear.
624 """
625
626 - def __init__(self, items, prefix=None):
631
635
split_at_single_quotes = re.compile("('+)").split


def xpath_literal(s):
    """Return ``s`` rendered as an XPath string literal.

    An ``Element`` is converted through its ``_format_element()``
    method, anything else through ``_unicode()``.  Text without single
    quotes is wrapped in single quotes, text without double quotes in
    double quotes, and text containing both is emitted as a
    ``concat(...)`` of alternately quoted pieces.
    """
    if isinstance(s, Element):
        text = s._format_element()
    else:
        text = _unicode(s)

    if "'" not in text:
        return "'%s'" % text
    if '"' not in text:
        return '"%s"' % text

    # Both quote characters occur: split into runs of single quotes and
    # the text between them, quoting each piece with the other character.
    quoted_parts = []
    for part in split_at_single_quotes(text):
        if not part:
            continue
        if "'" in part:
            quoted_parts.append('"%s"' % part)
        else:
            quoted_parts.append("'%s'" % part)
    return "concat(%s)" % ','.join(quoted_parts)
654
655
656
657
673
675 result = []
676 while 1:
677 result.append(parse_selector(stream))
678 if stream.peek() == ',':
679 stream.next()
680 else:
681 break
682 if len(result) == 1:
683 return result[0]
684 else:
685 return Or(result)
686
705
def parse_simple_selector(stream):
    """Parse one simple selector from ``stream`` and return its node.

    Reads an optional type selector (``element``, ``*`` or
    ``namespace|element``) and then any run of ``#id``, ``.class``,
    ``[attrib]`` and ``:pseudo`` / ``:function(arg)`` qualifiers, wrapping
    the result in ``Hash``, ``Class``, ``Pseudo`` or ``Function`` nodes
    (attribute tests are delegated to ``parse_attrib``).  A single ``' '``
    token directly after the selector is consumed.

    Raises ``SelectorSyntaxError`` when a symbol, ``]`` or ``)`` is
    required but a different token (or the end of the stream) is found.
    """
    peek = stream.peek()
    if peek != '*' and not isinstance(peek, Symbol):
        # No explicit type selector: any element in any namespace.
        element = namespace = '*'
    else:
        first = stream.next()
        if first != '*' and not isinstance(first, Symbol):
            raise SelectorSyntaxError(
                "Expected symbol, got '%s'" % first)
        if stream.peek() == '|':
            namespace = first
            stream.next()  # consume the '|'
            element = stream.next()
            # Validate (and report) the token that follows '|'.  Checking
            # the namespace token again would accept any token, or the
            # end of the stream, as an element name.
            if element != '*' and not isinstance(element, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got '%s'" % element)
        else:
            namespace = '*'
            element = first
    result = Element(namespace, element)
    has_hash = False
    while 1:
        peek = stream.peek()
        if peek == '#':
            if has_hash:
                # Only one #id is accepted per simple selector; stop
                # here and leave the second '#' in the stream.
                break
            stream.next()
            result = Hash(result, stream.next())
            has_hash = True
            continue
        elif peek == '.':
            stream.next()
            result = Class(result, stream.next())
            continue
        elif peek == '[':
            stream.next()
            result = parse_attrib(result, stream)
            closing = stream.next()
            if closing != ']':
                raise SelectorSyntaxError(
                    "] expected, got '%s'" % closing)
            continue
        elif peek == ':' or peek == '::':
            pseudo_type = stream.next()
            ident = stream.next()
            if not isinstance(ident, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got '%s'" % ident)
            if stream.peek() == '(':
                stream.next()
                peek = stream.peek()
                if isinstance(peek, String):
                    selector = stream.next()
                elif isinstance(peek, Symbol) and is_int(peek):
                    selector = int(stream.next())
                else:
                    # Anything else is parsed as a nested simple selector.
                    selector = parse_simple_selector(stream)
                closing = stream.next()
                if closing != ')':
                    raise SelectorSyntaxError(
                        "Expected ')', got '%s' and '%s'"
                        % (closing, selector))
                result = Function(result, pseudo_type, ident, selector)
            else:
                result = Pseudo(result, pseudo_type, ident)
            continue
        else:
            if peek == ' ':
                stream.next()
            break
    return result
781
def is_int(v):
    """Return True if ``int(v)`` succeeds, False otherwise.

    Values that ``int()`` rejects with ``ValueError`` (non-numeric text)
    or ``TypeError`` (e.g. ``None``) both count as "not an integer", so
    the predicate never raises for such input.
    """
    try:
        int(v)
    except (TypeError, ValueError):
        return False
    return True
789
def parse_attrib(selector, stream):
    """Parse the inside of ``[...]`` and return an ``Attrib`` node.

    Reads ``attrib`` or ``namespace|attrib``; if the next token is ``]``
    the node is an ``'exists'`` test with no value.  Otherwise an
    operator and a value (symbol or string) are read.  The closing ``]``
    is left in the stream.

    Raises ``SelectorSyntaxError`` for an unknown operator or a value
    that is neither a ``Symbol`` nor a ``String``.
    """
    operators = ('^=', '$=', '*=', '=', '~=', '|=', '!=')

    namespace = '*'
    attrib = stream.next()
    if stream.peek() == '|':
        stream.next()  # consume the '|'
        namespace, attrib = attrib, stream.next()

    if stream.peek() == ']':
        return Attrib(selector, namespace, attrib, 'exists', None)

    op = stream.next()
    if op not in operators:
        raise SelectorSyntaxError(
            "Operator expected, got '%s'" % op)

    value = stream.next()
    if isinstance(value, (Symbol, String)):
        return Attrib(selector, namespace, attrib, op, value)
    raise SelectorSyntaxError(
        "Expected string or symbol, got '%s'" % value)
809
def parse_series(s):
    """
    Turn an ``an+b`` style expression (e.g. '1n+2', 'odd', '3') into
    the tuple (a, b).

    ``s`` may be an ``Element`` (converted via ``_format_element()``),
    an int, or text.  A false value or '*' gives (0, 0).
    """
    if isinstance(s, Element):
        s = s._format_element()
    if not s or s == '*':
        return (0, 0)
    if isinstance(s, int):
        # A bare number is the offset alone.
        return (0, s)
    for keyword, series in (('odd', (2, 1)), ('even', (2, 0)), ('n', (1, 0))):
        if s == keyword:
            return series
    if 'n' not in s:
        # Text without 'n' is just the offset.
        return (0, int(s))

    def coefficient(text, default):
        # Empty text means the default; a lone sign means +/-1.
        if not text:
            return default
        if text == '-' or text == '+':
            return int(text + '1')
        return int(text)

    step_text, offset_text = s.split('n', 1)
    return (coefficient(step_text, 1), coefficient(offset_text, 0))
845
846
847
848
849
850
851 _match_whitespace = re.compile(r'\s+', re.UNICODE).match
852
853 _replace_comments = re.compile(r'/\*.*?\*/', re.DOTALL).sub
854
855 _match_count_number = re.compile(r'[+-]?\d*n(?:[+-]\d+)?').match
856
858 pos = 0
859 s = _replace_comments('', s)
860 while 1:
861 match = _match_whitespace(s, pos=pos)
862 if match:
863 preceding_whitespace_pos = pos
864 pos = match.end()
865 else:
866 preceding_whitespace_pos = 0
867 if pos >= len(s):
868 return
869 match = _match_count_number(s, pos=pos)
870 if match and match.group() != 'n':
871 sym = s[pos:match.end()]
872 yield Symbol(sym, pos)
873 pos = match.end()
874 continue
875 c = s[pos]
876 c2 = s[pos:pos+2]
877 if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='):
878 yield Token(c2, pos)
879 pos += 2
880 continue
881 if c in '>+~,.*=[]()|:#':
882 if c in '.#[' and preceding_whitespace_pos > 0:
883 yield Token(' ', preceding_whitespace_pos)
884 yield Token(c, pos)
885 pos += 1
886 continue
887 if c == '"' or c == "'":
888
889 old_pos = pos
890 sym, pos = tokenize_escaped_string(s, pos)
891 yield String(sym, old_pos)
892 continue
893 old_pos = pos
894 sym, pos = tokenize_symbol(s, pos)
895 yield Symbol(sym, old_pos)
896 continue
897
# Splits a string literal around its backslash escapes: either a backslash
# followed by 1-6 hex digits and one optional whitespace (or CRLF), or a
# backslash followed by a single non-hex character.  Raw strings are used
# throughout so that '\s' reaches the regex engine as written instead of
# being an invalid escape sequence in a normal string literal.
split_at_string_escapes = re.compile(
    r'(\\(?:%s))' % '|'.join([
        r'[A-Fa-f0-9]{1,6}(?:\r\n|\s)?',
        r'[^A-Fa-f0-9]',
    ])).split


def unescape_string_literal(literal):
    """Return ``literal`` with its backslash escapes resolved.

    ``\\`` followed by hex digits becomes the character with that code
    point; ``\\`` followed by any other character becomes that character.

    Raises ``SelectorSyntaxError`` for a backslash that does not start a
    valid escape (e.g. a trailing lone backslash) and for a hex escape
    whose code point cannot be converted to a character.
    """
    pieces = []
    for piece in split_at_string_escapes(literal):
        if not piece:
            continue
        if '\\' in piece:
            if piece[0] != '\\' or len(piece) < 2:
                raise SelectorSyntaxError(
                    "Invalid escape sequence %r in string %r"
                    % (piece.split('\\')[1], literal))
            piece = piece[1:]
            if piece[0] in '0123456789ABCDEFabcdef':
                try:
                    # int() ignores the optional trailing whitespace.
                    piece = _unichr(int(piece, 16))
                except ValueError:
                    # Code point outside the range the platform supports.
                    raise SelectorSyntaxError(
                        "Invalid escape sequence %r in string %r"
                        % (piece, literal))
        pieces.append(piece)
    return ''.join(pieces)
919
def tokenize_escaped_string(s, pos):
    """Read the quoted string that starts at ``s[pos]``.

    ``s[pos]`` must be the opening ``"`` or ``'``.  Returns
    ``(contents, new_pos)`` where ``contents`` is the text between the
    quotes (passed through ``unescape_string_literal`` if it contains a
    backslash) and ``new_pos`` is the index just after the closing quote.

    Raises ``SelectorSyntaxError`` if no closing quote is found.
    """
    quote = s[pos]
    assert quote in ('"', "'")
    pos = pos + 1
    start = pos
    while 1:
        closing = s.find(quote, pos)
        if closing == -1:
            raise SelectorSyntaxError(
                "Expected closing %s for string in: %r"
                % (quote, s[start:]))
        result = s[start:closing]
        # The quote is escaped only when an ODD number of backslashes
        # precedes it; an even run is a sequence of escaped backslashes
        # and the quote really terminates the string.
        trailing_backslashes = len(result) - len(result.rstrip('\\'))
        if trailing_backslashes % 2:
            pos = closing + 1
            continue
        if '\\' in result:
            result = unescape_string_literal(result)
        return result, closing + 1
939
# Any character that cannot be part of a symbol: everything except word
# characters, the backslash and the hyphen.
_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)


def tokenize_symbol(s, pos):
    """Read a symbol from ``s`` starting at ``pos``.

    Returns ``(symbol, new_pos)`` where ``new_pos`` is the index of the
    first character that cannot belong to the symbol (or ``len(s)``).

    Raises ``SelectorSyntaxError`` if the character at ``pos`` cannot
    start a symbol, or if the backslash sequences in the symbol cannot
    be decoded.
    """
    start = pos
    match = _illegal_symbol.search(s, pos=pos)
    if not match:
        # The symbol runs to the end of the input.
        # NOTE(review): this path returns the raw text without the
        # backslash decoding applied below -- confirm that is intended.
        return s[start:], len(s)
    if match.start() == pos:
        # Must be a real error, not an assert: with assertions disabled
        # this would return an empty symbol without advancing ``pos``.
        raise SelectorSyntaxError(
            "Unexpected symbol: %r at %s" % (s[pos], pos))
    result = s[start:match.start()]
    pos = match.start()
    try:
        result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape')
    except UnicodeDecodeError:
        # sys.exc_info() is kept so the exception is obtained the same
        # way as before, independent of the ``except ... as`` syntax.
        import sys
        e = sys.exc_info()[1]
        raise SelectorSyntaxError(
            "Bad symbol %r: %s" % (result, e))
    return result, pos
965
967
def __init__(self, tokens, source=None):
    """Wrap the iterable ``tokens`` for token-by-token reading.

    ``source`` is stored as given.  ``used`` starts as an empty list,
    and the one-token look-ahead state (``peeked`` / ``_peeking``)
    starts empty.
    """
    self.used = []
    iterator = iter(tokens)
    self.tokens = iterator
    self.source = source
    self.peeked = None
    self._peeking = False
    # Use the iterator's ``next`` method when it has one and fall back
    # to ``__next__`` otherwise.
    advance = getattr(iterator, 'next', None)
    if advance is None:
        advance = iterator.__next__
    self.next_token = advance
979
981 if self._peeking:
982 self._peeking = False
983 self.used.append(self.peeked)
984 return self.peeked
985 else:
986 try:
987 next = self.next_token()
988 self.used.append(next)
989 return next
990 except StopIteration:
991 return None
992
995
997 if not self._peeking:
998 try:
999 self.peeked = self.next_token()
1000 except StopIteration:
1001 return None
1002 self._peeking = True
1003 return self.peeked
1004