root/livinglogic.python.xist/src/ll/xist/xfind.py @ 2831:580cb3f32382

Revision 2831:580cb3f32382, 59.6 KB (checked in by Walter Doerwald <walter@…>, 12 years ago)

Add a method cssweight() to all selectors.

This method calculates the specificity of a CSS selector as specified
here: http://www.w3.org/TR/css3-selectors/#specificity

Line 
1# -*- coding: iso-8859-1 -*-
2
3## Copyright 1999-2007 by LivingLogic AG, Bayreuth/Germany.
4## Copyright 1999-2007 by Walter Dörwald
5##
6## All Rights Reserved
7##
8## See xist/__init__.py for the license
9
10
11"""
12<par>This module contains XFind and CSS selectors and related classes and functions.</par>
13
14<par>A selector is a &xist; tree traversal filter that traverses the complete
15&xml; tree and outputs those nodes specified by the selector. Selectors can
16be combined with various operations and form a language comparable to
17<link href="http://www.w3.org/TR/xpath">XPath</link> but implemented as Python
18expressions. The following code shows some
examples. First let's define some support code:</par>
20
21<example><title>Support code (put in <filename>help.py</filename>)</title>
22<prog>
23from ll.xist import xsc, xfind, parsers
24from ll.xist.ns import html
25
26node = parsers.parseURL("http://www.python.org", tidy=True)
27
28def output(selector):
29    for n in node.walknode(selector):
30        print n.bytes()
31</prog>
32</example>
33
34<par>We can now use this code in a Python session via <lit>from help import *</lit>.</par>
35
36<prog>
37<prompt>>>> </prompt><input>output(html.a/html.img) # images children of a elements</input>
38<![CDATA[<img src="/images/python-logo.gif" alt="homepage" id="logo" border="0" />
39<img id="skiptonav" alt="skip to navigation" src="/images/trans.gif" border="0" />
40<img id="skiptocontent" alt="skip to content" src="/images/trans.gif" border="0" />
41<img alt="success story photo" class="success" src="/images/success/nasa.jpg" />]]>
42
43<prompt>>>> </prompt><input>output(html.ul//html.a) # a descendants of ul elements</input>
44<![CDATA[<a title="About The Python Language" href="/about/">About</a>
45<a title="Major Happenings Within the Python Community" href="/news/">News</a>
46<a title="Tutorials, Library Reference, C API" href="/doc/">Documentation</a>]]>
47
48<prompt>>>> </prompt><input>output(html.img & xfind.attrendswith("src", ".jpg")) # JPEG images</input>
49<![CDATA[<img alt="success story photo" class="success" src="/images/success/nasa.jpg" />]]>
50
51<prompt>>>> </prompt><input>output(html.img & ~xfind.hasattr("title")) # All images without a title attribute</input>
52<![CDATA[<img src="/images/python-logo.gif" border="0" id="logo" alt="homepage" />
53<img id="skiptonav" border="0" src="/images/trans.gif" alt="skip to navigation" />
54<img id="skiptocontent" border="0" src="/images/trans.gif" alt="skip to content" />
55<img alt="success story photo" src="/images/success/nasa.jpg" class="success" />]]>
56
57<prompt>>>> </prompt><input>output(html.a & xfind.hasclass("reference")) # Links with 'reference' class</input>
58<![CDATA[<a class="reference" href="/search">Advanced Search</a>
59<a href="about/success/rackspace" class="reference">Rackspace</a>
60<a href="about/success/ilm" class="reference">Industrial Light and Magic</a>]]>
61
62<prompt>>>> </prompt><input>output(html.ul/html.li[0]) # Every li element that is the first li child of its ul parent</input>
63<![CDATA[<li>
64          <a title="About The Python Language" href="/about/">About</a>
65        </li>
66<li><a title="Manuals for Latest Stable Release" href="http://docs.python.org/">Documentation</a></li>
67<li class="group"><a href="http://wiki.python.org/moin/WebProgramming">Web Programming</a></li>]]>
68
69</prog>
70"""
71
# "$Revision$" is a version control keyword (presumably expanded on checkout
# to something like "$Revision: 1.23 $" -- confirm for the VCS in use).
# When the keyword is left unexpanded, splitting yields a single item, so
# fall back to a placeholder instead of failing the import with an IndexError.
_revision = "$Revision$".split()
__version__ = _revision[1] if len(_revision) > 1 else "unknown"
del _revision
# $Source$
74
75
76try:
77    import cssutils
78    from cssutils.css import cssstylerule
79    from cssutils.css import selector as cssselector
80    from cssutils.css import cssnamespacerule
81except ImportError:
82    pass
83
84from ll import misc
85from ll.xist import xsc
86
87
class CSSWeight(tuple):
    """
    A 3-item tuple <lit>(a, b, c)</lit> holding the specificity of a &CSS;
    selector as defined by
    <link href="http://www.w3.org/TR/css3-selectors/#specificity">CSS3</link>.
    All three items default to <lit>0</lit>.
    """

    def __new__(cls, a=0, b=0, c=0):
        return super(CSSWeight, cls).__new__(cls, (a, b, c))

    def __add__(self, other):
        # Itemwise addition (not tuple concatenation); other only has to
        # support indexing with 0, 1 and 2.
        a, b, c = self
        return CSSWeight(a + other[0], b + other[1], c + other[2])

    def __repr__(self):
        return "CSSWeight(%r, %r, %r)" % tuple(self)
102
103
class Selector(xsc.WalkFilter):
    """
    <par>Base class for all tree traversal filters that visit the complete
    tree. Whether a node gets output is decided by the
    <method>match</method> method, which subclasses must overwrite.</par>

    <par>Selectors can be combined via operators: <lit>/</lit> (child),
    <lit>//</lit> (descendant), <lit>*</lit> (adjacent sibling),
    <lit>**</lit> (general sibling), <lit>&amp;</lit> (and),
    <lit>|</lit> (or) and <lit>~</lit> (not). For the binary operators the
    right operand is converted via
    <function>xsc.makewalkfilter</function>.</par>
    """

    @misc.notimplemented
    def match(self, path):
        # To be overwritten by subclasses: return whether the node at the
        # end of path is selected.
        pass

    def filter(self, path):
        """
        Return the walk filter result for <arg>path</arg>: content and
        attributes are always entered; the leading <lit>True</lit> is only
        included when <method>match</method> accepts <arg>path</arg>.
        """
        if self.match(path):
            return (True, xsc.entercontent, xsc.enterattrs)
        return (xsc.entercontent, xsc.enterattrs)

    def __div__(self, other):
        return ChildCombinator(self, xsc.makewalkfilter(other))

    def __truediv__(self, other):
        # With true division in effect (from __future__ import division)
        # the / operator looks up __truediv__ instead of __div__. Delegate,
        # so that subclasses overwriting __div__ are honored too.
        return self.__div__(other)

    def __floordiv__(self, other):
        return DescendantCombinator(self, xsc.makewalkfilter(other))

    def __mul__(self, other):
        return AdjacentSiblingCombinator(self, xsc.makewalkfilter(other))

    def __pow__(self, other):
        return GeneralSiblingCombinator(self, xsc.makewalkfilter(other))

    def __and__(self, other):
        return AndCombinator(self, xsc.makewalkfilter(other))

    def __or__(self, other):
        return OrCombinator(self, xsc.makewalkfilter(other))

    def __invert__(self):
        return NotCombinator(self)

    def cssweight(self):
        """
        Return the &CSS; specificity of <self/> as a
        <pyref class="CSSWeight"><class>CSSWeight</class></pyref> object.
        This base implementation returns the zero weight.
        """
        return CSSWeight()
146
147
class IsInstanceSelector(Selector):
    """
    <par>Selector that matches every node that is an instance of (one of)
    the specified type(s). An <class>IsInstanceSelector</class> object can be
    created directly, or a class can simply be passed to a function that
    expects a walk filter.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>html.a</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<a id="logolink" accesskey="1" href="http://www.python.org/"><img src="http://www.python.org/images/python-logo.gif" id="logo" border="0" alt="homepage" /></a>
    <a accesskey="2" href="http://www.python.org/#left%2dhand%2dnavigation"><img id="skiptonav" src="http://www.python.org/images/trans.gif" border="0" alt="skip to navigation" /></a>
    <a accesskey="3" href="http://www.python.org/#content%2dbody"><img id="skiptocontent" src="http://www.python.org/images/trans.gif" border="0" alt="skip to content" /></a>
    <a class="reference" href="http://www.python.org/search">Advanced Search</a>
    <a title="About The Python Language" href="http://www.python.org/about/">About</a>]]>
    <rep>...</rep>
    </tty>
    </example>
    """
    def __init__(self, *types):
        self.types = types

    def match(self, path):
        # An empty path never matches
        return isinstance(path[-1], self.types) if path else False

    def __or__(self, other):
        # Two type checks can be merged into one isinstance() test with a
        # longer type tuple; anything else gets the generic or-combination.
        if isinstance(other, xsc._Node_Meta):
            extra = (other,)
        elif isinstance(other, IsInstanceSelector):
            extra = other.types
        else:
            return Selector.__or__(self, other)
        return IsInstanceSelector(*(self.types + extra))

    def __getitem__(self, index):
        return nthoftype(index, *self.types)

    def __str__(self):
        names = ["%s.%s" % (cls.__module__, cls.__name__) for cls in self.types]
        if len(names) == 1:
            return names[0]
        return "(%s)" % " | ".join(names)
195
196
class hasname(Selector):
    """
    <par>Selector that matches all nodes whose class has the specified Python
    name (only elements, processing instructions and entities are considered).
    A namespace name can be passed as the second argument; in this case only
    elements from that namespace are matched.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.hasname("img")</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<img border="0" src="http://www.python.org/images/python-logo.gif" alt="homepage" id="logo" />
    <img border="0" id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" />
    <img border="0" id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" />
    <img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />]]>
    </tty>
    </example>
    """
    def __init__(self, name, xmlns=None):
        self.name = name
        self.xmlns = xsc.nsname(xmlns)

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if self.xmlns is None:
            # No namespace given: the name alone decides
            return isinstance(node, (xsc.Element, xsc.ProcInst, xsc.Entity)) and node.__class__.__name__ == self.name
        # With a namespace only elements can match
        return isinstance(node, xsc.Element) and node.__class__.__name__ == self.name and node.xmlns == self.xmlns

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r)" % (clsname, self.name)
233
234
class hasname_xml(Selector):
    """
    <class>hasname_xml</class> works like <pyref class="hasname"><class>hasname</class></pyref>,
    except that the specified name is compared to the &xml; name of the node
    instead of the Python name.
    """
    def __init__(self, name, xmlns=None):
        self.name = name
        self.xmlns = xsc.nsname(xmlns)

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if self.xmlns is None:
            # No namespace given: the name alone decides
            return isinstance(node, (xsc.Element, xsc.ProcInst, xsc.Entity)) and node.xmlname == self.name
        # With a namespace only elements can match
        return isinstance(node, xsc.Element) and node.xmlname == self.name and node.xmlns == self.xmlns

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r)" % (clsname, self.name)
255
256
class IsSelector(Selector):
    """
    <par>Selector that selects one specific node in the tree (the test is done
    by identity, not by equality). This can be combined with other selectors
    via <pyref class="ChildCombinator"><class>ChildCombinator</class>s</pyref>
    or <pyref class="DescendantCombinator"><class>DescendantCombinator</class>s</pyref>
    to select children of this specific node. You can either create an
    <class>IsSelector</class> directly or simply pass a node to a function that
    expects a walk filter.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>doc[0]/xsc.Element</em>):</input>
    <prompt>... </prompt><input>\tprint repr(node)</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<ll.xist.ns.html.head element object (13 children/no attrs) (from http://www.python.org/:6:?) at 0xb6c82f4c>
    <ll.xist.ns.html.body element object (19 children/no attrs) (from http://www.python.org/:26:?) at 0xb6c3154c>]]>
    </tty>
    </example>
    """
    def __init__(self, node):
        self.node = node

    def match(self, path):
        # Always return a real bool: a bare "path and ..." would hand back
        # the (falsy) empty path object itself for an empty path.
        return bool(path) and path[-1] is self.node

    def __str__(self):
        return "%s(%r)" % (self.__class__.__name__, self.node)
286
287
class isroot(Selector):
    """
    <par>Selector that matches when the path consists of exactly one node,
    i.e. the node the traversal started from.</par>
    """

    def match(self, path):
        depth = len(path)
        return depth == 1

    def __str__(self):
        return "isroot"


# The class is replaced by its only instance, so isroot is used without calling it
isroot = isroot()
297
298
class empty(Selector):
    """
    <par>Selector that matches all elements and fragments that have no
    content (i.e. whose length is 0).</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.empty</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<meta content="text/html; charset=utf-8" http-equiv="content-type" />
    <meta content="python programming language object oriented web free source" name="keywords" />
    <meta content="      Home page for Python, an interpreted, interactive, object-oriented, extensible
          programming language. It provides an extraordinary combination of clarity and
          versatility, and is free and comprehensively ported. " name="description" />
    <link type="application/rss+xml" href="http://www.python.org/channews.rdf" rel="alternate" title="RSS" />]]>
    <rep>...</rep>
    </tty>
    </example>
    """

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        # Only elements and fragments can be empty; every other node type never matches
        return isinstance(node, (xsc.Element, xsc.Frag)) and len(node) == 0

    def __str__(self):
        return "empty"


# The class is replaced by its only instance, so empty is used without calling it
empty = empty()
333
334
class onlychild(Selector):
    """
    <par>Selector that matches all nodes that are the only child of their
    parent (which must be an element or a fragment).</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.onlychild &amp; html.a</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<a accesskey="2" href="http://www.python.org/#left%2dhand%2dnavigation"><img id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" border="0" /></a>
    <a accesskey="3" href="http://www.python.org/#content%2dbody"><img id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" border="0" /></a>
    <a href="http://www.python.org/download/releases/2.5.1">Quick Links (2.5.1)</a>
    <a title="Manuals for Latest Stable Release" href="http://docs.python.org/">Documentation</a>]]>
    <rep>...</rep>
    </tty>
    </example>
    """

    def match(self, path):
        # Without a parent in the path the node can't be anybody's child
        if len(path) < 2:
            return False
        parent = path[-2]
        if not isinstance(parent, (xsc.Frag, xsc.Element)):
            return False
        return len(parent) == 1 and parent[0] is path[-1]

    def __str__(self):
        return "onlychild"


# The class is replaced by its only instance, so onlychild is used without calling it
onlychild = onlychild()
367
368
class onlyoftype(Selector):
    """
    <par>Selector that matches all nodes that have no sibling of their own
    type, i.e. no other child of the parent is an instance of the class of
    the node.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.onlyoftype &amp; xsc.Element</em>):</input>
    <prompt>... </prompt><input>\tprint repr(node)</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<ll.xist.ns.html.html element object (2 children/1 attr) (from http://www.python.org/:4:?) at 0xb6d6e7ec>
    <ll.xist.ns.html.head element object (13 children/no attrs) (from http://www.python.org/:6:?) at 0xb6cc1f8c>
    <ll.xist.ns.html.title element object (1 child/no attrs) (from http://www.python.org/:8:?) at 0xb6d79b8c>
    <ll.xist.ns.html.body element object (19 children/no attrs) (from http://www.python.org/:26:?) at 0xb6d7282c>]]>
    <rep>...</rep>
    </tty>
    </example>
    """

    def match(self, path):
        # Without a parent in the path there are no siblings to compare with
        if len(path) < 2:
            return False
        node = path[-1]
        parent = path[-2]
        if not isinstance(parent, (xsc.Frag, xsc.Element)):
            return False
        # A single other child of the same class is enough to rule the node out
        return not any(isinstance(child, node.__class__) and child is not node for child in parent)

    def __str__(self):
        return "onlyoftype"


# The class is replaced by its only instance, so onlyoftype is used without calling it
onlyoftype = onlyoftype()
407
408
class hasattr(Selector):
    """
    <par>Selector that matches all element nodes that have at least one of
    the attributes with the specified Python names. For selecting nodes with
    global attributes the attribute class can be passed.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html, xml</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.hasattr(xml.Attrs.lang)</em>):</input>
    <prompt>... </prompt><input>\tprint repr(node)</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<ll.xist.ns.html.html element object (2 children/2 attrs) (from http://www.python.org/:4:?) at 0xb6d71d4c>]]>
    </tty>
    </example>
    """

    def __init__(self, *attrnames):
        self.attrnames = attrnames

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        # Asking for an attribute is only done for names the element allows
        return any(node.Attrs.isallowed(name) and node.attrs.has(name) for name in self.attrnames)

    def __str__(self):
        names = ", ".join(repr(name) for name in self.attrnames)
        return "%s(%s)" % (self.__class__.__name__, names)
450
451
class hasattr_xml(Selector):
    """
    <class>hasattr_xml</class> works like <pyref class="hasattr"><class>hasattr</class></pyref>,
    except that the specified names are treated as &xml; names instead of
    Python names.
    """

    def __init__(self, *attrnames):
        self.attrnames = attrnames

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        # Asking for an attribute is only done for names the element allows
        return any(node.Attrs.isallowed_xml(name) and node.attrs.has_xml(name) for name in self.attrnames)

    def __str__(self):
        names = ", ".join(repr(name) for name in self.attrnames)
        return "%s(%s)" % (self.__class__.__name__, names)
472
473
class attrhasvalue(Selector):
    """
    <par>Selector that matches all element nodes where the attribute with the
    specified Python name has exactly the specified value. For global
    attributes the attribute class can be passed. Note that
    <pyref module="ll.xist.xsc" class="Attr" method="isfancy">fancy</pyref> attributes
    never match.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.attrhasvalue("rel", "stylesheet")</em>):</input>
    <prompt>... </prompt><input>\tprint repr(node)</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<link media="screen" type="text/css" href="http://www.python.org/styles/screen-switcher-default.css" rel="stylesheet" id="screen-switcher-stylesheet" />
    <link media="scReen" type="text/css" rel="stylesheet" href="http://www.python.org/styles/netscape4.css" />
    <link media="print" type="text/css" rel="stylesheet" href="http://www.python.org/styles/print.css" />]]>
    </tty>
    </example>
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element) or not node.Attrs.isallowed(self.attrname):
            return False
        attr = node.attrs.get(self.attrname)
        # Fancy attributes (e.g. those containing PIs) never match
        if attr.isfancy():
            return False
        return unicode(attr) == self.attrvalue

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attrname, self.attrvalue)
511
512
class attrhasvalue_xml(Selector):
    """
    <class>attrhasvalue_xml</class> works like <pyref class="attrhasvalue"><class>attrhasvalue</class></pyref>,
    except that the specified name is treated as an &xml; name instead of a
    Python name.
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element) or not node.Attrs.isallowed_xml(self.attrname):
            return False
        attr = node.attrs.get_xml(self.attrname)
        # Fancy attributes (e.g. those containing PIs) never match
        if attr.isfancy():
            return False
        return unicode(attr) == self.attrvalue

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attrname, self.attrvalue)
534
535
class attrcontains(Selector):
    """
    <par>Selector that matches all element nodes where the value of the
    attribute with the specified Python name contains the specified
    substring. For global attributes the attribute class can be passed.
    Note that
    <pyref module="ll.xist.xsc" class="Attr" method="isfancy">fancy</pyref>
    attributes never match.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.attrcontains("rel", "stylesheet")</em>):</input>
    <prompt>... </prompt><input>\tprint repr(node)</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<link type="text/css" id="screen-switcher-stylesheet" media="screen" rel="stylesheet" href="http://www.python.org/styles/screen-switcher-default.css" />
    <link type="text/css" media="scReen" rel="stylesheet" href="http://www.python.org/styles/netscape4.css" />
    <link type="text/css" media="print" rel="stylesheet" href="http://www.python.org/styles/print.css" />
    <link type="text/css" title="large text" media="screen" rel="alternate stylesheet" href="http://www.python.org/styles/largestyles.css" />
    <link type="text/css" title="default fonts" media="screen" rel="alternate stylesheet" href="http://www.python.org/styles/defaultfonts.css" />]]>
    </tty>
    </example>
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element) or not node.Attrs.isallowed(self.attrname):
            return False
        attr = node.attrs.get(self.attrname)
        # Fancy attributes (e.g. those containing PIs) never match
        if attr.isfancy():
            return False
        return self.attrvalue in unicode(attr)

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attrname, self.attrvalue)
575
576
class attrcontains_xml(Selector):
    """
    <class>attrcontains_xml</class> works like <pyref class="attrcontains"><class>attrcontains</class></pyref>,
    except that the specified name is treated as an &xml; name instead of a
    Python name.
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element) or not node.Attrs.isallowed_xml(self.attrname):
            return False
        attr = node.attrs.get_xml(self.attrname)
        # Fancy attributes (e.g. those containing PIs) never match
        if attr.isfancy():
            return False
        return self.attrvalue in unicode(attr)

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attrname, self.attrvalue)
598
599
class attrstartswith(Selector):
    """
    <par>Selector that matches all element nodes where the value of the
    attribute with the specified Python name starts with the specified
    string. For global attributes the attribute class can be passed. Note that
    <pyref module="ll.xist.xsc" class="Attr" method="isfancy">fancy</pyref> attributes
    never match.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.attrstartswith("class_", "input-")</em>):</input>
    <prompt>... </prompt><input>\tprint repr(node)</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<input class="input-text" id="q" type="text" name="q" />
    <input value="search" class="input-button" id="submit" type="submit" name="submit" />]]>
    </tty>
    </example>
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element) or not node.Attrs.isallowed(self.attrname):
            return False
        attr = node.attrs.get(self.attrname)
        # Fancy attributes (e.g. those containing PIs) never match
        if attr.isfancy():
            return False
        return unicode(attr).startswith(self.attrvalue)

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attrname, self.attrvalue)
636
637
class attrstartswith_xml(Selector):
    """
    <class>attrstartswith_xml</class> works like <pyref class="attrstartswith"><class>attrstartswith</class></pyref>,
    except that the specified name is treated as an &xml; name instead of a
    Python name.
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element) or not node.Attrs.isallowed_xml(self.attrname):
            return False
        attr = node.attrs.get_xml(self.attrname)
        # Fancy attributes (e.g. those containing PIs) never match
        if attr.isfancy():
            return False
        return unicode(attr).startswith(self.attrvalue)

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attrname, self.attrvalue)
659
660
class attrendswith(Selector):
    """
    <par>Selector that matches all element nodes where the value of the
    attribute with the specified Python name ends with the specified string.
    For global attributes the attribute class can be passed. Note that
    <pyref module="ll.xist.xsc" class="Attr" method="isfancy">fancy</pyref> attributes
    never match.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.attrendswith("href", ".css")</em>):</input>
    <prompt>... </prompt><input>\tprint repr(node)</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<link href="http://www.python.org/styles/screen-switcher-default.css" type="text/css" rel="stylesheet" id="screen-switcher-stylesheet" media="screen" />
    <link type="text/css" rel="stylesheet" href="http://www.python.org/styles/netscape4.css" media="scReen" />
    <link type="text/css" rel="stylesheet" href="http://www.python.org/styles/print.css" media="print" />
    <link title="large text" type="text/css" rel="alternate stylesheet" href="http://www.python.org/styles/largestyles.css" media="screen" />
    <link title="default fonts" type="text/css" rel="alternate stylesheet" href="http://www.python.org/styles/defaultfonts.css" media="screen" />]]>
    </tty>
    </example>
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element) or not node.Attrs.isallowed(self.attrname):
            return False
        attr = node.attrs.get(self.attrname)
        # Fancy attributes (e.g. those containing PIs) never match
        if attr.isfancy():
            return False
        return unicode(attr).endswith(self.attrvalue)

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attrname, self.attrvalue)
700
701
class attrendswith_xml(Selector):
    """
    <class>attrendswith_xml</class> works like <pyref class="attrendswith"><class>attrendswith</class></pyref>,
    except that the specified name is treated as an &xml; name instead of a
    Python name.
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element) or not node.Attrs.isallowed_xml(self.attrname):
            return False
        attr = node.attrs.get_xml(self.attrname)
        # Fancy attributes (e.g. those containing PIs) never match
        if attr.isfancy():
            return False
        return unicode(attr).endswith(self.attrvalue)

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attrname, self.attrvalue)
723
724
class hasid(Selector):
    """
    <par>A <class>hasid</class> selector selects every element node whose
    <lit>id</lit> attribute is equal to the value passed to the
    constructor.</par>
    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.hasid("logo")</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<img src="http://www.python.org/images/python-logo.gif" id="logo" alt="homepage" border="0" />]]>
    </tty>
    </example>
    """

    def __init__(self, id):
        self.id = id

    def __str__(self):
        classname = self.__class__.__name__
        return "%s(%r)" % (classname, self.id)

    def cssweight(self):
        # Only the first component of the weight is set for an id selector
        weight = CSSWeight(1, 0, 0)
        return weight

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if not (isinstance(node, xsc.Element) and node.Attrs.isallowed_xml("id")):
            return False
        attr = node.attrs.get_xml("id")
        # Attributes for which isfancy() is true never match
        if attr.isfancy():
            return False
        return unicode(attr) == self.id
758
759
class hasclass(Selector):
    """
    <par>A <class>hasclass</class> selector selects every element node whose
    <lit>class</lit> attribute contains the specified class name as one of its
    whitespace separated entries.</par>
    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.hasclass("reference")</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<a class="reference" href="http://www.python.org/search">Advanced Search</a>
    <a href="http://www.python.org/about/success/rackspace" class="reference">Rackspace</a>
    <a href="http://www.python.org/about/success/ilm" class="reference">Industrial Light and Magic</a>
    <a href="http://www.python.org/about/success/astra" class="reference">AstraZeneca</a>]]>
    <rep>...</rep>
    </tty>
    </example>
    """

    def __init__(self, classname):
        self.classname = classname

    def __str__(self):
        name = self.__class__.__name__
        return "%s(%r)" % (name, self.classname)

    def cssweight(self):
        # Only the middle component of the weight is set for a class selector
        weight = CSSWeight(0, 1, 0)
        return weight

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if not (isinstance(node, xsc.Element) and node.Attrs.isallowed_xml("class")):
            return False
        attr = node.attrs.get_xml("class")
        # Attributes for which isfancy() is true never match
        if attr.isfancy():
            return False
        classnames = unicode(attr).split()
        return self.classname in classnames
797
798
class inattr(Selector):
    """
    <par>The <class>inattr</class> selector selects every path that contains
    an attribute node, i.e. attribute nodes themselves as well as all nodes
    inside of attributes.</par>
    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.inattr &amp; xsc.Text</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    text/html; charset=utf-8
    content-type
    python programming language object oriented web free source
    <rep>...</rep>
    </tty>
    </example>
    """
    def __str__(self):
        return "inattr"

    def match(self, path):
        # One attribute node anywhere in the path is enough
        for node in path:
            if isinstance(node, xsc.Attr):
                return True
        return False


# Replace the class by its only instance, so that the selector can be used as
# xfind.inattr without calling it
inattr = inattr()
824
825
class Combinator(Selector):
    """
    <par><class>Combinator</class> is the base class of all selectors that
    are built from other selectors: a combinator either transforms a single
    selector or combines two or more selectors in a certain way.</par>
    """
831
832
class BinaryCombinator(Combinator):
    """
    <par>A <class>BinaryCombinator</class> is a combinator that combines two
    selectors: the left hand selector and the right hand selector.</par>

    <par>The class attribute <lit>symbol</lit> is the operator string that is
    put between both operands by <meth>__str__</meth>.</par>
    """
    symbol = None

    def __init__(self, left, right):
        (self.left, self.right) = (left, right)

    def __str__(self):
        parts = []
        for operand in (self.left, self.right):
            text = str(operand)
            # Combinators of a different class get parentheses
            if isinstance(operand, Combinator) and not isinstance(operand, self.__class__):
                text = "(%s)" % text
            parts.append(text)
        return "%s%s%s" % (parts[0], self.symbol, parts[1])

    def cssweight(self):
        # The weight of the combination is the sum of the weights of both sides
        leftweight = self.left.cssweight()
        rightweight = self.right.cssweight()
        return leftweight+rightweight
855
856
class ChildCombinator(BinaryCombinator):
    """
    <par>A <class>ChildCombinator</class> is a <class>BinaryCombinator</class>.
    A node matches the <class>ChildCombinator</class> if the node itself
    matches the right hand selector and its immediate parent matches the left
    hand selector (i.e. it works similar to the <lit>&gt;</lit> combinator in
    &css; or the <lit>/</lit> combinator in XPath).</par>

    <par><class>ChildCombinator</class>s can be created via the division operator (<lit>/</lit>):</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>html.a/html.img</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<img src="http://www.python.org/images/python-logo.gif" alt="homepage" id="logo" border="0" />
    <img id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" border="0" />
    <img id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" border="0" />
    <img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />]]>
    </tty>
    </example>
    """
    symbol = " / "

    def match(self, path):
        if not path or not self.right.match(path):
            return False
        # The left hand selector is tested against the path of the parent
        parentpath = path[:-1]
        return self.left.match(parentpath)
888
889
class DescendantCombinator(BinaryCombinator):
    """
    <par>A <class>DescendantCombinator</class> is a <class>BinaryCombinator</class>.
    A node matches the <class>DescendantCombinator</class> if the node itself
    matches the right hand selector and any of its ancestor nodes matches the
    left hand selector (i.e. it works similar to the descendant combinator in
    &css; or the <lit>//</lit> combinator in XPath).</par>

    <par><class>DescendantCombinator</class>s can be created via the floor division
    operator (<lit>//</lit>):</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>html.div//html.img</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<img id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" border="0" />
    <img id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" border="0" />
    <img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />]]>
    </tty>
    </example>
    """
    symbol = " // "

    def match(self, path):
        if not path or not self.right.match(path):
            return False
        # Test every proper prefix of the path, from the longest one (the path
        # of the parent) down to the empty path
        for length in reversed(range(len(path))):
            if self.left.match(path[:length]):
                return True
        return False
924
925
class AdjacentSiblingCombinator(BinaryCombinator):
    """
    <par>An <class>AdjacentSiblingCombinator</class> is a <class>BinaryCombinator</class>.
    A node matches the <class>AdjacentSiblingCombinator</class> if the node
    itself matches the right hand selector and the sibling immediately
    preceding it matches the left hand selector.</par>

    <par><class>AdjacentSiblingCombinator</class>s can be created via the
    multiplication operator (<lit>*</lit>). The following example outputs all links
    inside those <class>p</class> elements that immediately follow a
    <class>h2</class> element:</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>html.h2*html.p/html.a</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<a href="http://www.scipy.org/SciPy2007" class="reference">SciPy Conference</a>
    <a href="https://www.enthought.com/scipy07/" class="reference">early registration</a>
    <a href="http://www.europython.org/sections/registration_issues/how-to-register" class="reference">Online registration</a>
    <a href="http://europython.org/" class="reference">EuroPython 2007</a>
    <a href="http://www.osdc.com.au/papers/cfp.html" class="reference">Call For Papers</a>
    <a href="http://www.swa.hpi.uni-potsdam.de/dls07/" class="reference">DLS 2007</a>
    <a href="http://pythonpapers.cgpublisher.com/" class="reference">The Python Papers</a>
    <a href="http://www.pyconuk.org/" class="reference">PyCon UK</a>
    <a href="http://www.pyconuk.org/submit.html" class="reference">proposals for talks</a>
    <a href="http://www.pycon.it/registration/" class="reference">registration online</a>]]>
    </tty>
    </example>
    """

    symbol = " * "

    def match(self, path):
        if len(path) < 2 or not self.right.match(path):
            return False
        (parent, node) = (path[-2], path[-1])
        # Remember the last child seen before the node itself shows up
        previous = None
        for candidate in parent:
            if candidate is node:
                break
            previous = candidate
        if previous is None:
            return False
        # Test the left hand selector with the sibling in place of the node
        return self.left.match(path[:-1]+[previous])
974
975
class GeneralSiblingCombinator(BinaryCombinator):
    """
    <par>A <class>GeneralSiblingCombinator</class> is a <class>BinaryCombinator</class>.
    A node matches the <class>GeneralSiblingCombinator</class> if the node
    itself matches the right hand selector and any of its preceding siblings
    matches the left hand selector.</par>

    <par><class>GeneralSiblingCombinator</class>s can be created via the
    exponentiation operator (<lit>**</lit>). The following example outputs all links
    that are not the first links inside their parent (i.e. they have another link
    among their preceding siblings):</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>html.a**html.a</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<a href="http://www.python.org/about/success/ilm" class="reference">Industrial Light and Magic</a>
    <a href="http://www.python.org/about/success/astra" class="reference">AstraZeneca</a>
    <a href="http://www.python.org/about/success/honeywell" class="reference">Honeywell</a>
    <a href="http://www.python.org/about/success" class="reference">and many others</a>
    <a href="http://www.zope.org/">Zope</a>]]>
    <rep>...</rep>
    </tty>
    </example>
    """

    symbol = " ** "

    def match(self, path):
        if len(path) < 2 or not self.right.match(path):
            return False
        (parent, node) = (path[-2], path[-1])
        for sibling in parent:
            if sibling is node:
                # Reached the node itself: none of the previous siblings matched
                return False
            if self.left.match(path[:-1]+[sibling]):
                return True
        return False
1017
1018
class ChainedCombinator(Combinator):
    """
    <par>A <class>ChainedCombinator</class> is a combinator that combines an
    arbitrary number of other selectors.</par>

    <par>The class attribute <lit>symbol</lit> is the operator string that is
    put between the selectors by <meth>__str__</meth>.</par>
    """

    symbol = None

    def __init__(self, *selectors):
        self.selectors = selectors

    def __str__(self):
        def render(selector):
            text = str(selector)
            # Combinators of a different class get parentheses
            if isinstance(selector, Combinator) and not isinstance(selector, self.__class__):
                return "(%s)" % text
            return text

        rendered = [render(selector) for selector in self.selectors]
        return self.symbol.join(rendered)

    def cssweight(self):
        raise TypeError("no weight info for chained combinator")
1041
1042
class OrCombinator(ChainedCombinator):
    """
    <par>An <class>OrCombinator</class> is a <class>ChainedCombinator</class>
    that is matched by a node if the node matches at least one of the combined
    selectors. An <class>OrCombinator</class> can be created with the binary
    or operator (<lit>|</lit>).</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.hasattr("href") | xfind.hasattr("src")</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<link type="application/rss+xml" title="RSS" rel="alternate" href="http://www.python.org/channews.rdf" />
    <link media="screen" type="text/css" id="screen-switcher-stylesheet" rel="stylesheet" href="http://www.python.org/styles/screen-switcher-default.css" />
    <link media="scReen" type="text/css" rel="stylesheet" href="http://www.python.org/styles/netscape4.css" />
    <link media="print" type="text/css" rel="stylesheet" href="http://www.python.org/styles/print.css" />
    <link media="screen" type="text/css" title="large text" rel="alternate stylesheet" href="http://www.python.org/styles/largestyles.css" />
    <link media="screen" type="text/css" title="default fonts" rel="alternate stylesheet" href="http://www.python.org/styles/defaultfonts.css" />
    <script src="http://www.python.org/js/iotbs2-key-directors-load.js" type="text/javascript"></script>
    <script src="http://www.python.org/js/iotbs2-directors.js" type="text/javascript"></script>
    <script src="http://www.python.org/js/iotbs2-core.js" type="text/javascript"></script>
    <a accesskey="1" id="logolink" href="http://www.python.org/"><img alt="homepage" src="http://www.python.org/images/python-logo.gif" id="logo" border="0" /></a>]]>
    <rep>...</rep>
    </tty>
    </example>
    """

    symbol = " | "

    def match(self, path):
        # The first matching selector decides
        for selector in self.selectors:
            if selector.match(path):
                return True
        return False

    def __or__(self, other):
        # Extend the chain instead of nesting a new combinator
        selectors = self.selectors + (xsc.makewalkfilter(other),)
        return OrCombinator(*selectors)
1079
1080
class AndCombinator(ChainedCombinator):
    """
    <par>An <class>AndCombinator</class> is a <class>ChainedCombinator</class>
    that is matched by a node only if the node matches all of the combined
    selectors. An <class>AndCombinator</class> can be created with the binary
    and operator (<lit>&amp;</lit>).</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>html.input &amp; xfind.hasattr("id")</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<input id="domains" name="domains" value="www.python.org" type="hidden" />
    <input id="sitesearch" name="sitesearch" value="www.python.org" type="hidden" />
    <input id="sourceid" name="sourceid" value="google-search" type="hidden" />
    <input id="q" class="input-text" name="q" type="text" />
    <input id="submit" value="search" name="submit" type="submit" class="input-button" />]]>
    </tty>
    </example>
    """

    symbol = " & "

    def match(self, path):
        # The first selector that does not match decides
        for selector in self.selectors:
            if not selector.match(path):
                return False
        return True

    def __and__(self, other):
        # Extend the chain instead of nesting a new combinator
        selectors = self.selectors + (xsc.makewalkfilter(other),)
        return AndCombinator(*selectors)
1111
1112
class NotCombinator(Combinator):
    """
    <par>A <class>NotCombinator</class> inverts the result of the selector it
    wraps, i.e. a node matches the <class>NotCombinator</class> only if it
    does not match the underlying selector. A <class>NotCombinator</class> can
    be created with the unary inversion operator (<lit>~</lit>).</par>

    <par>The following example outputs all images that don't have a <lit>border</lit> attribute:</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>html.img &amp; ~xfind.hasattr("border")</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />]]>
    </tty>
    </example>
    """

    def __init__(self, selector):
        self.selector = selector

    def match(self, path):
        matched = self.selector.match(path)
        return not matched

    def __str__(self):
        inner = self.selector
        # Any combinator except another NotCombinator gets parentheses
        needsparens = isinstance(inner, Combinator) and not isinstance(inner, NotCombinator)
        template = "~(%s)" if needsparens else "~%s"
        return template % inner
1145
1146
class CallableSelector(Selector):
    """
    <par>A <class>CallableSelector</class> is a selector that delegates the
    decision to a user specified callable. The callable gets passed the path
    and must return a bool specifying whether this path is selected.
    A <class>CallableSelector</class> is created implicitly whenever a callable
    is passed to a method that expects a walk filter.</par>

    <par>The following example outputs all links that point outside the <lit>python.org</lit> domain:</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>def foreignlink(path):</input>
    <prompt>... </prompt><input>    return path and isinstance(path[-1], html.a) and not path[-1].attrs.href.asURL().server.endswith(".python.org")</input>
    <prompt>... </prompt><input></input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>foreignlink</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<a href="http://homegain.com/" class="reference">HomeGain.com</a>
    <a href="http://www.zope.org/">Zope</a>
    <a href="http://www.djangoproject.com/">Django</a>
    <a href="http://www.turbogears.org/">TurboGears</a>
    <a href="http://pyxml.sourceforge.net/topics/">XML</a>]]>
    <rep>..</rep>
    </tty>
    </example>
    """
    def __init__(self, func):
        self.func = func

    def __str__(self):
        classname = self.__class__.__name__
        return "%s(%r)" % (classname, self.func)

    def match(self, path):
        # The result of the callable is passed through unchanged
        func = self.func
        return func(path)
1185
1186
class nthchild(Selector):
    """
    <par>An <class>nthchild</class> selector selects nodes by their position
    among the children of their parent. <arg>index</arg> is either used as an
    index into the parent node or is one of the strings <lit>"even"</lit> and
    <lit>"odd"</lit>. Positions are counted from 0, so <lit>"even"</lit>
    selects the children at position 0, 2, 4, ... and <lit>"odd"</lit> those
    at position 1, 3, 5, ...</par>
    """
    def __init__(self, index):
        self.index = index

    def __str__(self):
        classname = self.__class__.__name__
        return "%s(%r)" % (classname, self.index)

    def match(self, path):
        if len(path) < 2:
            return False
        (parent, node) = (path[-2], path[-1])
        if self.index in ("even", "odd"):
            # Remainder the (zero based) position must have
            wanted = 1 if self.index == "odd" else 0
            for (pos, child) in enumerate(parent):
                if child is node:
                    return pos % 2 == wanted
            return False
        try:
            return parent[self.index] is node
        except IndexError:
            # The parent doesn't have that many children
            return False
1206
1207
class nthoftype(Selector):
    """
    <par>An <class>nthoftype</class> selector selects nodes by their position
    among those children of their parent that are instances of certain
    classes. If <arg>types</arg> are passed to the constructor, those classes
    are used, otherwise the class of the node itself. <arg>index</arg> is
    either passed to <func>misc.item</func> or is one of the strings
    <lit>"even"</lit> and <lit>"odd"</lit> (positions are counted from 0, so
    <lit>"even"</lit> selects position 0, 2, 4, ...).</par>
    """
    def __init__(self, index, *types):
        (self.index, self.types) = (index, types)

    def _find(self, path):
        # Generate the children of the parent that have the right type
        wanted = self.types or path[-1].__class__
        for sibling in path[-2]:
            if not isinstance(sibling, wanted):
                continue
            yield sibling

    def match(self, path):
        if len(path) < 2:
            return False
        node = path[-1]
        if self.index in ("even", "odd"):
            # Remainder the (zero based) position must have
            wanted = 1 if self.index == "odd" else 0
            for (pos, child) in enumerate(self._find(path)):
                if child is node:
                    return pos % 2 == wanted
            return False
        try:
            return misc.item(self._find(path), self.index) is node
        except IndexError:
            # There aren't that many children of this type
            return False

    def __str__(self):
        classname = self.__class__.__name__
        if not self.types:
            return "%s(%r)" % (classname, self.index)
        typenames = ["%s.%s" % (cls.__module__, cls.__name__) for cls in self.types]
        return "%s(%r, %s)" % (classname, self.index, ", ".join(typenames))
1237
1238
1239###
1240### CSS helper functions
1241###
1242
1243def _is_nth_node(iterator, node, index):
1244    # Return whether node is the index'th node in iterator (starting at 1)
1245    # index is an int or int string or "even" or "odd"
1246    if index == "even":
1247        for (i, child) in enumerate(iterator):
1248            if child is node:
1249                return i % 2 == 1
1250        return False
1251    elif index == "odd":
1252        for (i, child) in enumerate(iterator):
1253            if child is node:
1254                return i % 2 == 0
1255        return False
1256    else:
1257        if not isinstance(index, (int, long)):
1258            try:
1259                index = int(index)
1260            except ValueError:
1261                raise ValueError("illegal argument %r" % index)
1262            else:
1263                if index < 1:
1264                    return False
1265        try:
1266            return iterator[index-1] is node
1267        except IndexError:
1268            return False
1269
1270
1271def _is_nth_last_node(iterator, node, index):
1272    # Return whether node is the index'th last node in iterator
1273    # index is an int or int string or "even" or "odd"
1274    if index == "even":
1275        pos = None
1276        for (i, child) in enumerate(iterator):
1277            if child is node:
1278                pos = i
1279        return pos is None or (i-pos) % 2 == 1
1280    elif index == "odd":
1281        pos = None
1282        for (i, child) in enumerate(iterator):
1283            if child is node:
1284                pos = i
1285        return pos is None or (i-pos) % 2 == 0
1286    else:
1287        if not isinstance(index, (int, long)):
1288            try:
1289                index = int(index)
1290            except ValueError:
1291                raise ValueError("illegal argument %r" % index)
1292            else:
1293                if index < 1:
1294                    return False
1295        try:
1296            return iterator[-index] is node
1297        except IndexError:
1298            return False
1299
1300
1301def _children_of_type(node, type):
1302    for child in node:
1303        if isinstance(child, xsc.Element) and child.xmlname == type:
1304            yield child
1305
1306
1307###
1308### CSS selectors
1309###
1310
class CSSWeightedSelector(Selector):
    """
    <par>Base class for selectors whose <meth>cssweight</meth> method returns
    <lit>CSSWeight(0, 1, 0)</lit>.</par>
    """
    def cssweight(self):
        weight = CSSWeight(0, 1, 0)
        return weight
1314
1315
class CSSHasAttributeSelector(CSSWeightedSelector):
    """
    <par>Selects every element node that has the attribute with the &xml;
    name <arg>attributename</arg>.</par>
    """
    def __init__(self, attributename):
        self.attributename = attributename

    def __str__(self):
        classname = self.__class__.__name__
        return "%s(%r)" % (classname, self.attributename)

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        # An attribute that isn't allowed for this element can't be set
        if not node.Attrs.isallowed_xml(self.attributename):
            return False
        return node.attrs.has_xml(self.attributename)
1329
1330
class CSSAttributeListSelector(CSSWeightedSelector):
    """
    <par>Selects every element node where the value of the attribute with the
    &xml; name <arg>attributename</arg> contains <arg>attributevalue</arg> as
    one of its whitespace separated entries.</par>
    """
    def __init__(self, attributename, attributevalue):
        (self.attributename, self.attributevalue) = (attributename, attributevalue)

    def __str__(self):
        classname = self.__class__.__name__
        return "%s(%r, %r)" % (classname, self.attributename, self.attributevalue)

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        if not node.Attrs.isallowed_xml(self.attributename):
            return False
        attr = node.attrs.get_xml(self.attributename)
        entries = unicode(attr).split()
        return self.attributevalue in entries
1346
1347
class CSSAttributeLangSelector(CSSWeightedSelector):
    """
    <par>Selects every element node where the value of the attribute with the
    &xml; name <arg>attributename</arg>, cut off at the first <lit>-</lit>,
    is equal to <arg>attributevalue</arg>.</par>
    """
    def __init__(self, attributename, attributevalue):
        (self.attributename, self.attributevalue) = (attributename, attributevalue)

    def __str__(self):
        classname = self.__class__.__name__
        return "%s(%r, %r)" % (classname, self.attributename, self.attributevalue)

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        if not node.Attrs.isallowed_xml(self.attributename):
            return False
        attr = node.attrs.get_xml(self.attributename)
        # Only the part in front of the first "-" is compared
        parts = unicode(attr).split("-", 1)
        if not parts:
            return False
        return parts[0] == self.attributevalue
1365
1366
class CSSFirstChildSelector(CSSWeightedSelector):
    """
    <par>Selects every node that is the first element among the children of
    its parent.</par>
    """
    def __str__(self):
        return "CSSFirstChildSelector()"

    def match(self, path):
        if len(path) < 2:
            return False
        # Only the element children of the parent are taken into account
        elements = path[-2][xsc.Element]
        return _is_nth_node(elements, path[-1], 1)
1373
1374
class CSSLastChildSelector(CSSWeightedSelector):
    """
    <par>Selects every node that is the last element among the children of
    its parent.</par>
    """
    def __str__(self):
        return "CSSLastChildSelector()"

    def match(self, path):
        if len(path) < 2:
            return False
        # Only the element children of the parent are taken into account
        elements = path[-2][xsc.Element]
        return _is_nth_last_node(elements, path[-1], 1)
1381
1382
class CSSFirstOfTypeSelector(CSSWeightedSelector):
    """
    <par>Selects every element node that is the first one among those element
    children of its parent that have the same <lit>xmlname</lit>.</par>
    """
    def __str__(self):
        return "CSSFirstOfTypeSelector()"

    def match(self, path):
        if len(path) < 2:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        sametype = misc.Iterator(_children_of_type(path[-2], node.xmlname))
        return _is_nth_node(sametype, node, 1)
1392
1393
class CSSLastOfTypeSelector(CSSWeightedSelector):
    """
    <par>Selects every element node that is the last one among those element
    children of its parent that have the same <lit>xmlname</lit>.</par>
    """
    def __str__(self):
        return "CSSLastOfTypeSelector()"

    def match(self, path):
        if len(path) < 2:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        sametype = misc.Iterator(_children_of_type(path[-2], node.xmlname))
        return _is_nth_last_node(sametype, node, 1)
1403
1404
class CSSOnlyChildSelector(CSSWeightedSelector):
    """
    <par>Selects every element node whose parent has no other element
    children.</par>
    """
    def __str__(self):
        return "CSSOnlyChildSelector()"

    def match(self, path):
        if len(path) < 2:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        # Any element child that isn't the node itself rules the node out
        return all(child is node for child in path[-2][xsc.Element])
1418
1419
class CSSOnlyOfTypeSelector(CSSWeightedSelector):
    """
    <par>Selects every element node whose parent has no other element
    children with the same <lit>xmlname</lit>.</par>
    """
    def __str__(self):
        return "CSSOnlyOfTypeSelector()"

    def match(self, path):
        if len(path) < 2:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        # Any sibling with the same name that isn't the node rules the node out
        return all(child is node for child in _children_of_type(path[-2], node.xmlname))
1433
1434
class CSSEmptySelector(CSSWeightedSelector):
    """
    <par>Selects every element node whose content contains neither element
    nodes nor non-empty text nodes.</par>
    """
    def __str__(self):
        return "CSSEmptySelector()"

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        for child in node.content:
            if isinstance(child, xsc.Element):
                return False
            # Empty text nodes don't count as content
            if isinstance(child, xsc.Text) and child:
                return False
        return True
1448
1449
class CSSRootSelector(CSSWeightedSelector):
    """
    <par>Selects an element node if it is the only node in the path.</par>
    """
    def __str__(self):
        return "CSSRootSelector()"

    def match(self, path):
        if len(path) != 1:
            return False
        return isinstance(path[0], xsc.Element)
1456
1457
class CSSLinkSelector(CSSWeightedSelector):
    """
    <par>Selects every element node named <lit>a</lit> in the namespace
    <lit>http://www.w3.org/1999/xhtml</lit> that has an <lit>href</lit>
    attribute.</par>
    """
    def __str__(self):
        return self.__class__.__name__ + "()"

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        if not node.xmlns=="http://www.w3.org/1999/xhtml":
            return False
        if not node.xmlname=="a":
            return False
        return "href" in node.attrs
1467
1468
class CSSInvalidPseudoSelector(CSSWeightedSelector):
    """
    <par>A selector that never matches any node.</par>
    """
    def __str__(self):
        return self.__class__.__name__ + "()"

    def match(self, path):
        # No path is ever selected
        return False
1475
1476
class CSSHoverSelector(CSSInvalidPseudoSelector):
    """
    <par>Inherits everything from <class>CSSInvalidPseudoSelector</class>,
    i.e. it never matches any node.</par>
    """
1479
1480
class CSSActiveSelector(CSSInvalidPseudoSelector):
    """
    <par>Inherits everything from <class>CSSInvalidPseudoSelector</class>,
    i.e. it never matches any node.</par>
    """
1483
1484
class CSSVisitedSelector(CSSInvalidPseudoSelector):
    """
    <par>Selector for the &css; pseudo-class <lit>visited</lit>. All behaviour
    is inherited from <class>CSSInvalidPseudoSelector</class>, so it never
    matches.</par>
    """
1487
1488
class CSSFunctionSelector(CSSWeightedSelector):
    """
    <par>Base class for selectors that implement a &css; function. The
    function argument is stored in the attribute <lit>value</lit>.</par>
    """

    def __init__(self, value=None):
        # The argument may be assigned later (css() sets it after instantiation).
        self.value = value

    def __str__(self):
        return "".join([self.__class__.__name__, "(", repr(self.value), ")"])
1495
1496
class CSSNthChildSelector(CSSFunctionSelector):
    """
    <par>Selector for the &css; function <lit>nth-child</lit>. The decision is
    delegated to <function>_is_nth_node</function>, which gets the parent's
    content filtered by <class>xsc.Element</class>, the element itself and
    the function argument <lit>self.value</lit>.</par>
    """

    def match(self, path):
        # A parent is needed to determine the position among the siblings.
        if len(path) < 2:
            return False
        element = path[-1]
        if not isinstance(element, xsc.Element):
            return False
        siblings = path[-2][xsc.Element]
        return _is_nth_node(siblings, element, self.value)
1504
1505
class CSSNthLastChildSelector(CSSFunctionSelector):
    """
    <par>Selector for the &css; function <lit>nth-last-child</lit>. The
    decision is delegated to <function>_is_nth_last_node</function>, which
    gets the parent's content filtered by <class>xsc.Element</class>, the
    element itself and the function argument <lit>self.value</lit>.</par>
    """

    def match(self, path):
        # A parent is needed to determine the position among the siblings.
        if len(path) < 2:
            return False
        element = path[-1]
        if not isinstance(element, xsc.Element):
            return False
        siblings = path[-2][xsc.Element]
        return _is_nth_last_node(siblings, element, self.value)
1513
1514
class CSSNthOfTypeSelector(CSSFunctionSelector):
    """
    <par>Selector for the &css; function <lit>nth-of-type</lit>. The decision
    is delegated to <function>_is_nth_node</function>, which gets the result
    of <function>_children_of_type</function> for the parent and the
    <lit>xmlname</lit> of the element (wrapped in a
    <class>misc.Iterator</class>), the element itself and the function
    argument <lit>self.value</lit>.</par>
    """

    def match(self, path):
        # A parent is needed to determine the position among the siblings.
        if len(path) < 2:
            return False
        element = path[-1]
        if not isinstance(element, xsc.Element):
            return False
        sametype = misc.Iterator(_children_of_type(path[-2], element.xmlname))
        return _is_nth_node(sametype, element, self.value)
1522
1523
class CSSNthLastOfTypeSelector(CSSFunctionSelector):
    """
    <par>Selector for the &css; function <lit>nth-last-of-type</lit>. The
    decision is delegated to <function>_is_nth_last_node</function>, which
    gets the result of <function>_children_of_type</function> for the parent
    and the <lit>xmlname</lit> of the element (wrapped in a
    <class>misc.Iterator</class>), the element itself and the function
    argument <lit>self.value</lit>.</par>
    """

    def match(self, path):
        # A parent is needed to determine the position among the siblings.
        if len(path) < 2:
            return False
        element = path[-1]
        if not isinstance(element, xsc.Element):
            return False
        sametype = misc.Iterator(_children_of_type(path[-2], element.xmlname))
        return _is_nth_last_node(sametype, element, self.value)
1531
1532
class CSSTypeSelector(Selector):
    """
    <par>Selector that tests the name and the namespace name of the last node
    in the path and additionally requires every selector in the list
    <lit>selectors</lit> (id, class, attribute etc. selectors) to match the
    same path.</par>

    <par><arg>type</arg> is the required <lit>xmlname</lit> and
    <arg>xmlns</arg> the required namespace (passed through
    <function>xsc.nsname</function>); for both <lit>"*"</lit> means
    <z>don't care</z>. Any further positional arguments are used as the
    initial content of <lit>selectors</lit>.</par>
    """

    def __init__(self, type="*", xmlns="*", *selectors):
        self.type = type
        self.xmlns = xsc.nsname(xmlns)
        # id, class, attribute etc. selectors for this node. Selectors passed
        # to the constructor must be kept (not dropped), so that the output of
        # __str__() describes a call that recreates an equivalent object.
        self.selectors = list(selectors)

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if self.type != "*" and node.xmlname != self.type:
            return False
        if self.xmlns != "*" and node.xmlns != self.xmlns:
            return False
        # All additional selectors must accept the same path.
        for selector in self.selectors:
            if not selector.match(path):
                return False
        return True

    def __str__(self):
        # Arguments are positional, so an argument can only be left out when
        # all arguments following it have their default value too.
        v = [self.__class__.__name__, "("]
        if self.type != "*" or self.xmlns != "*" or self.selectors:
            v.append(repr(self.type))
        if self.xmlns != "*" or self.selectors:
            v.append(", ")
            v.append(repr(self.xmlns))
        for selector in self.selectors:
            v.append(", ")
            v.append(str(selector))
        v.append(")")
        return "".join(v)

    def cssweight(self):
        # A concrete element name contributes one to the last component of the
        # weight; the weights of all additional selectors are added on top.
        result = CSSWeight(0, 0, int(self.type != "*"))
        for selector in self.selectors:
            result += selector.cssweight()
        return result
1570
1571
class CSSAdjacentSiblingCombinator(BinaryCombinator):
    """
    <par>A <class>CSSAdjacentSiblingCombinator</class> works similarly to an
    <class>AdjacentSiblingCombinator</class>, except that only preceding
    elements are considered.</par>
    """

    def match(self, path):
        # The right hand selector must match the node itself, and the node
        # needs a parent to have siblings at all.
        if len(path) < 2 or not self.right.match(path):
            return False
        node = path[-1]
        # Walk the element children of the parent, remembering the last one
        # seen before the node itself is reached.
        previous = None
        for candidate in path[-2][xsc.Element]:
            if candidate is node:
                break
            previous = candidate
        if previous is None:
            return False
        # The left hand selector is tested against the path of that sibling.
        return self.left.match(path[:-1]+[previous])

    def __str__(self):
        return "%s(%s, %s)" % (self.__class__.__name__, self.left, self.right)
1594
1595
class CSSGeneralSiblingCombinator(BinaryCombinator):
    """
    <par>A <class>CSSGeneralSiblingCombinator</class> works similarly to a
    <class>GeneralSiblingCombinator</class>, except that only preceding
    elements are considered.</par>
    """

    def match(self, path):
        # The right hand selector must match the node itself, and the node
        # needs a parent to have siblings at all.
        if len(path) < 2 or not self.right.match(path):
            return False
        node = path[-1]
        parentpath = path[:-1]
        for candidate in path[-2][xsc.Element]:
            # Reaching the node itself means that none of the element
            # siblings in front of it matched the left hand selector.
            if candidate is node:
                return False
            if self.left.match(parentpath+[candidate]):
                return True
        return False

    def __str__(self):
        return "%s(%s, %s)" % (self.__class__.__name__, self.left, self.right)
1615
1616
# Lookup tables used by css() to turn the tokens of a parsed CSS selector into
# selector/combinator objects.

# Operator inside an attribute selector -> class/factory that is called with
# the attribute name and the attribute value.
_attributecombinator2class = {
    "=": attrhasvalue_xml,
    "^=": attrstartswith_xml,
    "$=": attrendswith_xml,
    "*=": attrcontains_xml,
    "~=": CSSAttributeListSelector,
    "|=": CSSAttributeLangSelector,
}

# Combinator between two simple selector sequences -> combinator class that is
# called with the left and the right hand side selector.
_combinator2class = {
    " ": DescendantCombinator,
    ">": ChildCombinator,
    "+": CSSAdjacentSiblingCombinator,
    "~": CSSGeneralSiblingCombinator,
}

# Name of a pseudo-class -> selector class (instantiated without arguments).
_pseudoname2class = {
    # structural pseudo-classes
    "root": CSSRootSelector,
    "empty": CSSEmptySelector,
    "first-child": CSSFirstChildSelector,
    "last-child": CSSLastChildSelector,
    "only-child": CSSOnlyChildSelector,
    "first-of-type": CSSFirstOfTypeSelector,
    "last-of-type": CSSLastOfTypeSelector,
    "only-of-type": CSSOnlyOfTypeSelector,
    # link and user action pseudo-classes
    "link": CSSLinkSelector,
    "visited": CSSVisitedSelector,
    "hover": CSSHoverSelector,
    "active": CSSActiveSelector,
}

# Name of a CSS function (without the opening parenthesis) -> selector class;
# the function argument is stored in the attribute value of the instance.
_function2class = {
    "nth-child": CSSNthChildSelector,
    "nth-last-child": CSSNthLastChildSelector,
    "nth-of-type": CSSNthOfTypeSelector,
    "nth-last-of-type": CSSNthLastOfTypeSelector,
}

1654
1655
def css(selectors, prefixes=None):
    """
    Create a walk filter that will yield all nodes that match the specified
    &css; expression. <arg>selectors</arg> can be a string, a
    <class>CSSStyleRule</class> object or a
    <class>cssutils.css.selector.Selector</class> object. <arg>prefixes</arg>
    may be a mapping that maps namespace prefixes to namespace names (the key
    <lit>None</lit> is used for the default namespace).

    A <class>TypeError</class> is raised if <arg>selectors</arg> is of an
    unsupported type, a <class>ValueError</class> for unknown pseudo-classes,
    functions or combinators and an <class>xsc.IllegalPrefixError</class> if
    the selector uses a namespace prefix that is not in <arg>prefixes</arg>.
    """

    if isinstance(selectors, basestring):
        # Wrap the selector in a dummy style rule (with @namespace declarations
        # in front, if there are prefixes), so that cssutils can parse it.
        if prefixes is not None:
            prefixes = dict((key, xsc.nsname(value)) for (key, value) in prefixes.iteritems())
            # NOTE(review): %r assumes that repr(value) is a valid CSS string
            # literal (i.e. a plain str without a "u" prefix) -- confirm what
            # xsc.nsname() returns.
            declarations = "\n".join("@namespace %s %r;" % (key if key is not None else "", value) for (key, value) in prefixes.iteritems())
            selectors = "%s\n%s{}" % (declarations, selectors)
        else:
            selectors = "%s{}" % selectors
        for rule in cssutils.CSSParser().parseString(selectors).cssRules:
            if isinstance(rule, cssstylerule.CSSStyleRule):
                selectors = rule.selectorList
                break
        else:
            # The parsed stylesheet contains no style rule at all (presumably
            # because the parser rejected the selector -- TODO confirm).
            raise ValueError("can't happen")
    elif isinstance(selectors, cssstylerule.CSSStyleRule):
        selectors = selectors.selectorList
    elif isinstance(selectors, cssselector.Selector):
        selectors = [selectors]
    else:
        raise TypeError("can't handle %r" % type(selectors))

    orcombinators = []
    for selector in selectors:
        # rule is the type selector currently being filled, root is the
        # complete selector built so far (rule combined with everything to
        # the left of it).
        rule = root = CSSTypeSelector()
        prefix = None
        attributename = None
        attributevalue = None
        combinator = None
        inattr = False # are we between the brackets of an attribute selector?
        for x in selector.seq:
            t = x["type"]
            v = x["value"]
            if t == "prefix":
                prefix = v
            elif t == "pipe":
                if prefix != "*":
                    # Without any prefix mapping every prefix is illegal
                    # (indexing None would raise a TypeError instead).
                    if prefixes is None:
                        raise xsc.IllegalPrefixError(prefix)
                    try:
                        rule.xmlns = prefixes[prefix]
                    except KeyError:
                        raise xsc.IllegalPrefixError(prefix)
                prefix = None
            elif t == "type":
                rule.type = v
            elif t == "id":
                rule.selectors.append(hasid(v.lstrip("#")))
            elif t == "classname":
                rule.selectors.append(hasclass(v))
            elif t == "pseudoname":
                # Only the table lookup is guarded, so that a KeyError raised
                # by a constructor isn't reported as an unknown name.
                try:
                    pseudoclass = _pseudoname2class[v]
                except KeyError:
                    raise ValueError("unknown pseudoname %s" % v)
                rule.selectors.append(pseudoclass())
            elif t == "function":
                try:
                    functionclass = _function2class[v.rstrip("(")]
                except KeyError:
                    raise ValueError("unknown function %s" % v)
                rule.selectors.append(functionclass())
                rule.function = v
            elif t == "functionvalue":
                # The argument belongs to the function selector appended last.
                rule.selectors[-1].value = v
            elif t == "attributename":
                attributename = v
            elif t == "attributevalue":
                # Strip the quotes from quoted values.
                if (v.startswith("'") and v.endswith("'")) or (v.startswith('"') and v.endswith('"')):
                    v = v[1:-1]
                attributevalue = v
            elif t == "attribute selector":
                combinator = None
                inattr = True
            elif t == "attribute selector end":
                if combinator is None:
                    # No operator: only the existence of the attribute is tested.
                    rule.selectors.append(CSSHasAttributeSelector(attributename))
                else:
                    try:
                        attributeclass = _attributecombinator2class[combinator]
                    except KeyError:
                        # Report the unknown operator (not the attribute value).
                        raise ValueError("unknown combinator %s" % combinator)
                    rule.selectors.append(attributeclass(attributename, attributevalue))
                inattr = False
            elif t == "combinator":
                if inattr:
                    # Inside of an attribute selector the "combinator" is the
                    # comparison operator.
                    combinator = v
                else:
                    try:
                        combinatorclass = _combinator2class[v]
                    except KeyError:
                        raise ValueError("unknown combinator %s" % v)
                    # Start a new type selector for the right hand side and
                    # combine it with everything seen so far.
                    rule = CSSTypeSelector()
                    root = combinatorclass(root, rule)
        orcombinators.append(root)
    return orcombinators[0] if len(orcombinators) == 1 else OrCombinator(*orcombinators)
Note: See TracBrowser for help on using the browser.