root/livinglogic.python.xist/src/ll/xist/xfind.py @ 2820:162fcc3925fd

Revision 2820:162fcc3925fd, 57.5 KB (checked in by Walter Doerwald <walter@…>, 12 years ago)

Fix xfind.CSSNthOfTypeSelector and xfind.CSSNthLastOfTypeSelector. Add tests.

xfind._is_nth_node() and xfind._is_nth_last_node need an indexable iterator.
Add tests for the following CSS selectors: nth-child(), nth-last-child(),
nth-of-type(), nth-last-of-type().

Line 
1# -*- coding: iso-8859-1 -*-
2
3## Copyright 1999-2007 by LivingLogic AG, Bayreuth/Germany.
4## Copyright 1999-2007 by Walter Dörwald
5##
6## All Rights Reserved
7##
8## See xist/__init__.py for the license
9
10
11"""
12<par>This module contains XFind and CSS selectors and related classes and functions.</par>
13
14<par>A selector is a &xist; tree traversal filter that traverses the complete
15&xml; tree and outputs those nodes specified by the selector. Selectors can
16be combined with various operations and form a language comparable to
17<link href="http://www.w3.org/TR/xpath">XPath</link> but implemented as Python
18expressions. The following code shows some
19examples. First let's define some support code:</par>
20
21<example><title>Support code (put in <filename>help.py</filename>)</title>
22<prog>
23from ll.xist import xsc, xfind, parsers
24from ll.xist.ns import html
25
26node = parsers.parseURL("http://www.python.org", tidy=True)
27
28def output(selector):
29    for n in node.walknode(selector):
30        print n.bytes()
31</prog>
32</example>
33
34<par>We can now use this code in a Python session via <lit>from help import *</lit>.</par>
35
36<prog>
37<prompt>>>> </prompt><input>output(html.a/html.img) # images children of a elements</input>
38<![CDATA[<img src="/images/python-logo.gif" alt="homepage" id="logo" border="0" />
39<img id="skiptonav" alt="skip to navigation" src="/images/trans.gif" border="0" />
40<img id="skiptocontent" alt="skip to content" src="/images/trans.gif" border="0" />
41<img alt="success story photo" class="success" src="/images/success/nasa.jpg" />]]>
42
43<prompt>>>> </prompt><input>output(html.ul//html.a) # a descendants of ul elements</input>
44<![CDATA[<a title="About The Python Language" href="/about/">About</a>
45<a title="Major Happenings Within the Python Community" href="/news/">News</a>
46<a title="Tutorials, Library Reference, C API" href="/doc/">Documentation</a>]]>
47
48<prompt>>>> </prompt><input>output(html.img & xfind.attrendswith("src", ".jpg")) # JPEG images</input>
49<![CDATA[<img alt="success story photo" class="success" src="/images/success/nasa.jpg" />]]>
50
51<prompt>>>> </prompt><input>output(html.img & ~xfind.hasattr("title")) # All images without a title attribute</input>
52<![CDATA[<img src="/images/python-logo.gif" border="0" id="logo" alt="homepage" />
53<img id="skiptonav" border="0" src="/images/trans.gif" alt="skip to navigation" />
54<img id="skiptocontent" border="0" src="/images/trans.gif" alt="skip to content" />
55<img alt="success story photo" src="/images/success/nasa.jpg" class="success" />]]>
56
57<prompt>>>> </prompt><input>output(html.a & xfind.hasclass("reference")) # Links with 'reference' class</input>
58<![CDATA[<a class="reference" href="/search">Advanced Search</a>
59<a href="about/success/rackspace" class="reference">Rackspace</a>
60<a href="about/success/ilm" class="reference">Industrial Light and Magic</a>]]>
61
62<prompt>>>> </prompt><input>output(html.ul/html.li[0]) # Every li element that is the first li child of its ul parent</input>
63<![CDATA[<li>
64          <a title="About The Python Language" href="/about/">About</a>
65        </li>
66<li><a title="Manuals for Latest Stable Release" href="http://docs.python.org/">Documentation</a></li>
67<li class="group"><a href="http://wiki.python.org/moin/WebProgramming">Web Programming</a></li>]]>
68
69</prog>
70"""
71
# Extract the revision number from the version control keyword. The keyword
# only has the form "$Revision: N $" when keyword expansion is active; when it
# is left unexpanded there is no second field, so fall back to a placeholder
# instead of raising IndexError while the module is being imported.
__version__ = ("$Revision$".split() + ["unknown"])[1]
# $Source$
74
75
76try:
77    import cssutils
78    from cssutils.css import cssstylerule
79    from cssutils.css import cssnamespacerule
80except ImportError:
81    pass
82
83from ll import misc
84from ll.xist import xsc
85
86
class Selector(xsc.WalkFilter):
    """
    Base class for all tree traversal filters that visit the complete tree.
    Whether a node gets output can be specified by overwriting the
    <method>match</method> method. Selectors can be combined with the
    following operators: <lit>/</lit> (child), <lit>//</lit> (descendant),
    <lit>*</lit> (adjacent sibling), <lit>**</lit> (general sibling),
    <lit>&amp;</lit> (and), <lit>|</lit> (or) and <lit>~</lit> (not).
    For binary operators the right operand is converted to a walk filter
    via <function>xsc.makewalkfilter</function>.
    """

    @misc.notimplemented
    def match(self, path):
        # Must be overwritten by subclasses: return a true value if the last
        # node in path should be output.
        pass

    def filter(self, path):
        # Content and attributes are always entered, so that the complete tree
        # is visited; the leading True additionally outputs the current node.
        return (True, xsc.entercontent, xsc.enterattrs) if self.match(path) else (xsc.entercontent, xsc.enterattrs)

    def __div__(self, other):
        return ChildCombinator(self, xsc.makewalkfilter(other))

    def __truediv__(self, other):
        # The / operator is dispatched to __truediv__ instead of __div__ when
        # true division is in effect. Delegating (instead of aliasing) makes
        # sure that a subclass overwriting __div__ is honored too.
        return self.__div__(other)

    def __floordiv__(self, other):
        return DescendantCombinator(self, xsc.makewalkfilter(other))

    def __mul__(self, other):
        return AdjacentSiblingCombinator(self, xsc.makewalkfilter(other))

    def __pow__(self, other):
        return GeneralSiblingCombinator(self, xsc.makewalkfilter(other))

    def __and__(self, other):
        return AndCombinator(self, xsc.makewalkfilter(other))

    def __or__(self, other):
        return OrCombinator(self, xsc.makewalkfilter(other))

    def __invert__(self):
        return NotCombinator(self)
122
123
class IsInstanceSelector(Selector):
    """
    <par>Selector that selects every node that is an instance of one of the
    specified types. An <class>IsInstanceSelector</class> object can be created
    directly; alternatively a class can simply be passed to a function that
    expects a walk filter.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>html.a</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<a id="logolink" accesskey="1" href="http://www.python.org/"><img src="http://www.python.org/images/python-logo.gif" id="logo" border="0" alt="homepage" /></a>
    <a accesskey="2" href="http://www.python.org/#left%2dhand%2dnavigation"><img id="skiptonav" src="http://www.python.org/images/trans.gif" border="0" alt="skip to navigation" /></a>
    <a accesskey="3" href="http://www.python.org/#content%2dbody"><img id="skiptocontent" src="http://www.python.org/images/trans.gif" border="0" alt="skip to content" /></a>
    <a class="reference" href="http://www.python.org/search">Advanced Search</a>
    <a title="About The Python Language" href="http://www.python.org/about/">About</a>]]>
    <rep>...</rep>
    </tty>
    </example>
    """
    def __init__(self, *types):
        # Tuple of classes that is handed to isinstance() in match()
        self.types = types

    def match(self, path):
        if not path:
            return False
        return isinstance(path[-1], self.types)

    def __or__(self, other):
        # Two type checks are merged into a single isinstance() test instead
        # of being wrapped in a generic or-combinator.
        if isinstance(other, xsc._Node_Meta):
            merged = self.types + (other,)
            return IsInstanceSelector(*merged)
        if isinstance(other, IsInstanceSelector):
            merged = self.types + other.types
            return IsInstanceSelector(*merged)
        return Selector.__or__(self, other)

    def __getitem__(self, index):
        # selector[index] restricts the selector to the index'th node of these types
        return nthoftype(index, *self.types)

    def __str__(self):
        names = ["%s.%s" % (cls.__module__, cls.__name__) for cls in self.types]
        if len(names) == 1:
            return names[0]
        return "(%s)" % " | ".join(names)
171
172
class hasname(Selector):
    """
    <par>Selector that selects all nodes whose class has the specified Python
    name (only elements, processing instructions and entities are considered).
    A namespace name can be passed as the second argument; in this case only
    elements from this namespace will be selected.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.hasname("img")</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<img border="0" src="http://www.python.org/images/python-logo.gif" alt="homepage" id="logo" />
    <img border="0" id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" />
    <img border="0" id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" />
    <img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />]]>
    </tty>
    </example>
    """
    def __init__(self, name, xmlns=None):
        self.name = name
        # Normalize the namespace via xsc.nsname() (stays None if no namespace is given)
        self.xmlns = xsc.nsname(xmlns)

    def match(self, path):
        if not path:
            return False
        candidate = path[-1]
        if self.xmlns is None:
            # Without a namespace any named node type qualifies
            return isinstance(candidate, (xsc.Element, xsc.ProcInst, xsc.Entity)) and candidate.__class__.__name__ == self.name
        # With a namespace only elements can match
        return isinstance(candidate, xsc.Element) and candidate.__class__.__name__ == self.name and candidate.xmlns == self.xmlns

    def __str__(self):
        classname = self.__class__.__name__
        return "%s(%r)" % (classname, self.name)
209
210
class hasname_xml(Selector):
    """
    <class>hasname_xml</class> behaves like <pyref class="hasname"><class>hasname</class></pyref>,
    but compares the specified name against the &xml; name of the node instead
    of the Python name.
    """
    def __init__(self, name, xmlns=None):
        self.name = name
        # Normalize the namespace via xsc.nsname() (stays None if no namespace is given)
        self.xmlns = xsc.nsname(xmlns)

    def match(self, path):
        if not path:
            return False
        candidate = path[-1]
        if self.xmlns is None:
            # Without a namespace any named node type qualifies
            return isinstance(candidate, (xsc.Element, xsc.ProcInst, xsc.Entity)) and candidate.xmlname == self.name
        # With a namespace only elements can match
        return isinstance(candidate, xsc.Element) and candidate.xmlname == self.name and candidate.xmlns == self.xmlns

    def __str__(self):
        classname = self.__class__.__name__
        return "%s(%r)" % (classname, self.name)
231
232
class IsSelector(Selector):
    """
    <par>Selector that selects one specific node in the tree. This can be
    combined with other selectors via <pyref class="ChildCombinator"><class>ChildCombinator</class>s</pyref>
    or <pyref class="DescendantCombinator"><class>DescendantCombinator</class>s</pyref>
    to select children of this specific node. You can either create an
    <class>IsSelector</class> directly or simply pass a node to a function that
    expects a walk filter.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>doc[0]/xsc.Element</em>):</input>
    <prompt>... </prompt><input>\tprint repr(node)</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<ll.xist.ns.html.head element object (13 children/no attrs) (from http://www.python.org/:6:?) at 0xb6c82f4c>
    <ll.xist.ns.html.body element object (19 children/no attrs) (from http://www.python.org/:26:?) at 0xb6c3154c>]]>
    </tty>
    </example>
    """
    def __init__(self, node):
        # The one node that this selector selects (compared by identity)
        self.node = node

    def match(self, path):
        # Always return a real boolean (like the other selectors do) instead
        # of handing back the empty path object when there is no node.
        return bool(path) and path[-1] is self.node

    def __str__(self):
        return "%s(%r)" % (self.__class__.__name__, self.node)
262
263
class isroot(Selector):
    """
    <par>Selector that selects the node at which the traversal started, i.e.
    the node whose path consists of exactly one node.</par>
    """

    def match(self, path):
        depth = len(path)
        return depth == 1

    def __str__(self):
        return "isroot"


# The selector has no state, so the class name is rebound to a single instance
isroot = isroot()
273
274
class empty(Selector):
    """
    <par>Selector that selects all elements and fragments that have no
    content.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.empty</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<meta content="text/html; charset=utf-8" http-equiv="content-type" />
    <meta content="python programming language object oriented web free source" name="keywords" />
    <meta content="      Home page for Python, an interpreted, interactive, object-oriented, extensible
          programming language. It provides an extraordinary combination of clarity and
          versatility, and is free and comprehensively ported. " name="description" />
    <link type="application/rss+xml" href="http://www.python.org/channews.rdf" rel="alternate" title="RSS" />]]>
    <rep>...</rep>
    </tty>
    </example>
    """

    def match(self, path):
        if not path:
            return False
        candidate = path[-1]
        # Only container nodes (elements and fragments) can be empty
        return isinstance(candidate, (xsc.Element, xsc.Frag)) and len(candidate) == 0

    def __str__(self):
        return "empty"


# The selector has no state, so the class name is rebound to a single instance
empty = empty()
309
310
class onlychild(Selector):
    """
    <par>Selector that selects all nodes that are the only child of their parent.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.onlychild &amp; html.a</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<a accesskey="2" href="http://www.python.org/#left%2dhand%2dnavigation"><img id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" border="0" /></a>
    <a accesskey="3" href="http://www.python.org/#content%2dbody"><img id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" border="0" /></a>
    <a href="http://www.python.org/download/releases/2.5.1">Quick Links (2.5.1)</a>
    <a title="Manuals for Latest Stable Release" href="http://docs.python.org/">Documentation</a>]]>
    <rep>...</rep>
    </tty>
    </example>
    """

    def match(self, path):
        # A node without a parent in the path can't be an only child
        if len(path) < 2:
            return False
        container = path[-2]
        if not isinstance(container, (xsc.Frag, xsc.Element)):
            return False
        return len(container) == 1 and container[0] is path[-1]

    def __str__(self):
        return "onlychild"


# The selector has no state, so the class name is rebound to a single instance
onlychild = onlychild()
343
344
class onlyoftype(Selector):
    """
    <par>Selector that selects all nodes that have no sibling of the same
    type, i.e. that are the only node of their type among the children of
    their parent.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.onlyoftype &amp; xsc.Element</em>):</input>
    <prompt>... </prompt><input>\tprint repr(node)</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<ll.xist.ns.html.html element object (2 children/1 attr) (from http://www.python.org/:4:?) at 0xb6d6e7ec>
    <ll.xist.ns.html.head element object (13 children/no attrs) (from http://www.python.org/:6:?) at 0xb6cc1f8c>
    <ll.xist.ns.html.title element object (1 child/no attrs) (from http://www.python.org/:8:?) at 0xb6d79b8c>
    <ll.xist.ns.html.body element object (19 children/no attrs) (from http://www.python.org/:26:?) at 0xb6d7282c>]]>
    <rep>...</rep>
    </tty>
    </example>
    """

    def match(self, path):
        # A node without a parent in the path has no siblings to compare with
        if len(path) < 2:
            return False
        current = path[-1]
        container = path[-2]
        if not isinstance(container, (xsc.Frag, xsc.Element)):
            return False
        # Any other child that is an instance of the node's class rules the node out
        return not any(
            isinstance(sibling, current.__class__) and sibling is not current
            for sibling in container
        )

    def __str__(self):
        return "onlyoftype"


# The selector has no state, so the class name is rebound to a single instance
onlyoftype = onlyoftype()
383
384
class hasattr(Selector):
    """
    <par>Selector that selects all element nodes that have at least one
    attribute with one of the specified Python names. To select nodes with
    global attributes the attribute class can be passed instead of a name.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html, xml</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.hasattr("id")</em>):</input>
    <prompt>... </prompt><input>\tprint repr(node)</input>
    <prompt>... </prompt><input></input>
    <rep>...</rep>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.hasattr(xml.Attrs.lang)</em>):</input>
    <prompt>... </prompt><input>\tprint repr(node)</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<ll.xist.ns.html.html element object (2 children/2 attrs) (from http://www.python.org/:4:?) at 0xb6d71d4c>]]>
    </tty>
    </example>
    """

    def __init__(self, *attrnames):
        # Attribute names (or attribute classes), one of which must be present
        self.attrnames = attrnames

    def match(self, path):
        if not path:
            return False
        element = path[-1]
        if not isinstance(element, xsc.Element):
            return False
        # The attribute must be allowed for this element type and must be set
        return any(
            element.Attrs.isallowed(candidate) and element.attrs.has(candidate)
            for candidate in self.attrnames
        )

    def __str__(self):
        arguments = ", ".join(map(repr, self.attrnames))
        return "%s(%s)" % (self.__class__.__name__, arguments)
426
427
class hasattr_xml(Selector):
    """
    <class>hasattr_xml</class> behaves like <pyref class="hasattr"><class>hasattr</class></pyref>,
    but treats the specified names as &xml; names instead of Python names.
    """

    def __init__(self, *attrnames):
        # &xml; attribute names, one of which must be present
        self.attrnames = attrnames

    def match(self, path):
        if not path:
            return False
        element = path[-1]
        if not isinstance(element, xsc.Element):
            return False
        # The attribute must be allowed for this element type and must be set
        return any(
            element.Attrs.isallowed_xml(candidate) and element.attrs.has_xml(candidate)
            for candidate in self.attrnames
        )

    def __str__(self):
        arguments = ", ".join(map(repr, self.attrnames))
        return "%s(%s)" % (self.__class__.__name__, arguments)
448
449
class attrhasvalue(Selector):
    """
    <par>Selector that selects all element nodes where the attribute with the
    specified Python name is equal to the specified value. For global
    attributes the attribute class can be passed instead of a name.
    <pyref module="ll.xist.xsc" class="Attr" method="isfancy">Fancy</pyref> attributes
    never match.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.attrhasvalue("rel", "stylesheet")</em>):</input>
    <prompt>... </prompt><input>\tprint repr(node)</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<link media="screen" type="text/css" href="http://www.python.org/styles/screen-switcher-default.css" rel="stylesheet" id="screen-switcher-stylesheet" />
    <link media="scReen" type="text/css" rel="stylesheet" href="http://www.python.org/styles/netscape4.css" />
    <link media="print" type="text/css" rel="stylesheet" href="http://www.python.org/styles/print.css" />]]>
    </tty>
    </example>
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def match(self, path):
        if not path:
            return False
        element = path[-1]
        if not (isinstance(element, xsc.Element) and element.Attrs.isallowed(self.attrname)):
            return False
        value = element.attrs.get(self.attrname)
        # Fancy attributes (e.g. ones containing processing instructions) never match
        return not value.isfancy() and unicode(value) == self.attrvalue

    def __str__(self):
        classname = self.__class__.__name__
        return "%s(%r, %r)" % (classname, self.attrname, self.attrvalue)
487
488
class attrhasvalue_xml(Selector):
    """
    <class>attrhasvalue_xml</class> behaves like <pyref class="attrhasvalue"><class>attrhasvalue</class></pyref>,
    but treats the specified name as an &xml; name instead of a Python name.
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def match(self, path):
        if not path:
            return False
        element = path[-1]
        if not (isinstance(element, xsc.Element) and element.Attrs.isallowed_xml(self.attrname)):
            return False
        value = element.attrs.get_xml(self.attrname)
        # Fancy attributes (e.g. ones containing processing instructions) never match
        return not value.isfancy() and unicode(value) == self.attrvalue

    def __str__(self):
        classname = self.__class__.__name__
        return "%s(%r, %r)" % (classname, self.attrname, self.attrvalue)
510
511
class attrcontains(Selector):
    """
    <par>Selector that selects all element nodes where the value of the
    attribute with the specified Python name contains the specified substring.
    For global attributes the attribute class can be passed instead of a name.
    <pyref module="ll.xist.xsc" class="Attr" method="isfancy">Fancy</pyref>
    attributes never match.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.attrcontains("rel", "stylesheet")</em>):</input>
    <prompt>... </prompt><input>\tprint repr(node)</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<link type="text/css" id="screen-switcher-stylesheet" media="screen" rel="stylesheet" href="http://www.python.org/styles/screen-switcher-default.css" />
    <link type="text/css" media="scReen" rel="stylesheet" href="http://www.python.org/styles/netscape4.css" />
    <link type="text/css" media="print" rel="stylesheet" href="http://www.python.org/styles/print.css" />
    <link type="text/css" title="large text" media="screen" rel="alternate stylesheet" href="http://www.python.org/styles/largestyles.css" />
    <link type="text/css" title="default fonts" media="screen" rel="alternate stylesheet" href="http://www.python.org/styles/defaultfonts.css" />]]>
    </tty>
    </example>
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def match(self, path):
        if not path:
            return False
        element = path[-1]
        if not (isinstance(element, xsc.Element) and element.Attrs.isallowed(self.attrname)):
            return False
        value = element.attrs.get(self.attrname)
        # Fancy attributes (e.g. ones containing processing instructions) never match
        return not value.isfancy() and self.attrvalue in unicode(value)

    def __str__(self):
        classname = self.__class__.__name__
        return "%s(%r, %r)" % (classname, self.attrname, self.attrvalue)
551
552
class attrcontains_xml(Selector):
    """
    <class>attrcontains_xml</class> behaves like <pyref class="attrcontains"><class>attrcontains</class></pyref>,
    but treats the specified name as an &xml; name instead of a Python name.
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def match(self, path):
        if not path:
            return False
        element = path[-1]
        if not (isinstance(element, xsc.Element) and element.Attrs.isallowed_xml(self.attrname)):
            return False
        value = element.attrs.get_xml(self.attrname)
        # Fancy attributes (e.g. ones containing processing instructions) never match
        return not value.isfancy() and self.attrvalue in unicode(value)

    def __str__(self):
        classname = self.__class__.__name__
        return "%s(%r, %r)" % (classname, self.attrname, self.attrvalue)
574
575
class attrstartswith(Selector):
    """
    <par>Selector that selects all element nodes where the value of the
    attribute with the specified Python name starts with the specified string.
    For global attributes the attribute class can be passed instead of a name.
    <pyref module="ll.xist.xsc" class="Attr" method="isfancy">Fancy</pyref> attributes
    never match.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.attrstartswith("class_", "input-")</em>):</input>
    <prompt>... </prompt><input>\tprint repr(node)</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<input class="input-text" id="q" type="text" name="q" />
    <input value="search" class="input-button" id="submit" type="submit" name="submit" />]]>
    </tty>
    </example>
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def match(self, path):
        if not path:
            return False
        element = path[-1]
        if not (isinstance(element, xsc.Element) and element.Attrs.isallowed(self.attrname)):
            return False
        value = element.attrs.get(self.attrname)
        # Fancy attributes (e.g. ones containing processing instructions) never match
        return not value.isfancy() and unicode(value).startswith(self.attrvalue)

    def __str__(self):
        classname = self.__class__.__name__
        return "%s(%r, %r)" % (classname, self.attrname, self.attrvalue)
612
613
class attrstartswith_xml(Selector):
    """
    <class>attrstartswith_xml</class> behaves like <pyref class="attrstartswith"><class>attrstartswith</class></pyref>,
    but treats the specified name as an &xml; name instead of a Python name.
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def match(self, path):
        if not path:
            return False
        element = path[-1]
        if not (isinstance(element, xsc.Element) and element.Attrs.isallowed_xml(self.attrname)):
            return False
        value = element.attrs.get_xml(self.attrname)
        # Fancy attributes (e.g. ones containing processing instructions) never match
        return not value.isfancy() and unicode(value).startswith(self.attrvalue)

    def __str__(self):
        classname = self.__class__.__name__
        return "%s(%r, %r)" % (classname, self.attrname, self.attrvalue)
635
636
class attrendswith(Selector):
    """
    <par>Selector that selects all element nodes where the value of the
    attribute with the specified Python name ends with the specified string.
    For global attributes the attribute class can be passed instead of a name.
    <pyref module="ll.xist.xsc" class="Attr" method="isfancy">Fancy</pyref> attributes
    never match.</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.attrendswith("href", ".css")</em>):</input>
    <prompt>... </prompt><input>\tprint repr(node)</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<link href="http://www.python.org/styles/screen-switcher-default.css" type="text/css" rel="stylesheet" id="screen-switcher-stylesheet" media="screen" />
    <link type="text/css" rel="stylesheet" href="http://www.python.org/styles/netscape4.css" media="scReen" />
    <link type="text/css" rel="stylesheet" href="http://www.python.org/styles/print.css" media="print" />
    <link title="large text" type="text/css" rel="alternate stylesheet" href="http://www.python.org/styles/largestyles.css" media="screen" />
    <link title="default fonts" type="text/css" rel="alternate stylesheet" href="http://www.python.org/styles/defaultfonts.css" media="screen" />]]>
    </tty>
    </example>
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def match(self, path):
        if not path:
            return False
        element = path[-1]
        if not (isinstance(element, xsc.Element) and element.Attrs.isallowed(self.attrname)):
            return False
        value = element.attrs.get(self.attrname)
        # Fancy attributes (e.g. ones containing processing instructions) never match
        return not value.isfancy() and unicode(value).endswith(self.attrvalue)

    def __str__(self):
        classname = self.__class__.__name__
        return "%s(%r, %r)" % (classname, self.attrname, self.attrvalue)
676
677
class attrendswith_xml(Selector):
    """
    <class>attrendswith_xml</class> behaves like <pyref class="attrendswith"><class>attrendswith</class></pyref>,
    but treats the specified name as an &xml; name instead of a Python name.
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def match(self, path):
        if not path:
            return False
        element = path[-1]
        if not (isinstance(element, xsc.Element) and element.Attrs.isallowed_xml(self.attrname)):
            return False
        value = element.attrs.get_xml(self.attrname)
        # Fancy attributes (e.g. ones containing processing instructions) never match
        return not value.isfancy() and unicode(value).endswith(self.attrvalue)

    def __str__(self):
        classname = self.__class__.__name__
        return "%s(%r, %r)" % (classname, self.attrname, self.attrvalue)
699
700
class hasid(Selector):
    """
    <par>Selector that selects all element nodes whose <lit>id</lit> attribute
    is equal to the specified value.</par>
    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.hasid("logo")</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<img src="http://www.python.org/images/python-logo.gif" id="logo" alt="homepage" border="0" />]]>
    </tty>
    </example>
    """

    def __init__(self, id):
        self.id = id

    def match(self, path):
        if not path:
            return False
        element = path[-1]
        # The attribute is looked up via its &xml; name "id"
        if not (isinstance(element, xsc.Element) and element.Attrs.isallowed_xml("id")):
            return False
        value = element.attrs.get_xml("id")
        # Fancy attributes never match
        return not value.isfancy() and unicode(value) == self.id

    def __str__(self):
        classname = self.__class__.__name__
        return "%s(%r)" % (classname, self.id)
731
732
class hasclass(Selector):
    """
    <par>Selector that selects all element nodes where the <lit>class</lit> attribute
    contains the specified class name as one of its whitespace separated words.</par>
    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.hasclass("reference")</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<a class="reference" href="http://www.python.org/search">Advanced Search</a>
    <a href="http://www.python.org/about/success/rackspace" class="reference">Rackspace</a>
    <a href="http://www.python.org/about/success/ilm" class="reference">Industrial Light and Magic</a>
    <a href="http://www.python.org/about/success/astra" class="reference">AstraZeneca</a>]]>
    <rep>...</rep>
    </tty>
    </example>
    """

    def __init__(self, classname):
        self.classname = classname

    def match(self, path):
        # None stands in for "no node" when the path is empty
        node = path[-1] if path else None
        if isinstance(node, xsc.Element) and node.Attrs.isallowed_xml("class"):
            value = node.attrs.get_xml("class")
            if not value.isfancy():
                # the attribute value is a whitespace separated list of names
                names = unicode(value).split()
                return self.classname in names
        return False

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r)" % (clsname, self.classname)
767
768
class inattr(Selector):
    """
    <par>Selector that selects all attribute nodes and all nodes inside of attributes.</par>
    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.inattr &amp; xsc.Text</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    text/html; charset=utf-8
    content-type
    python programming language object oriented web free source
    <rep>...</rep>
    </tty>
    </example>
    """
    def match(self, path):
        # The path matches as soon as any node on it is an attribute
        for node in path:
            if isinstance(node, xsc.Attr):
                return True
        return False

    def __str__(self):
        return "inattr"


# The name is rebound to an instance, so ``xfind.inattr`` can be used
# directly as a selector (as in the example above)
inattr = inattr()
794
795
class Combinator(Selector):
    """
    <par>A <class>Combinator</class> is a selector that transforms one other
    selector or combines two or more other selectors in a certain way.</par>

    <par>The <method>__str__</method> methods of the combinator classes in this
    module test their operands with <lit>isinstance(..., Combinator)</lit> to
    decide whether an operand has to be put in parentheses.</par>
    """
801
802
class BinaryCombinator(Combinator):
    """
    <par>A <class>BinaryCombinator</class> is a combinator that combines two selectors:
    the left hand selector and the right hand selector.</par>
    """
    # Text put between both operands by __str__; set by subclasses
    symbol = None

    def __init__(self, left, right):
        (self.left, self.right) = (left, right)

    def __str__(self):
        parts = []
        for operand in (self.left, self.right):
            text = str(operand)
            # Other kinds of combinators are put in parentheses, operands of
            # the same class (or a subclass) as self are not
            if isinstance(operand, Combinator) and not isinstance(operand, self.__class__):
                text = "(%s)" % text
            parts.append(text)
        return "%s%s%s" % (parts[0], self.symbol, parts[1])
822
823
class ChildCombinator(BinaryCombinator):
    """
    <par>A <class>ChildCombinator</class> is a <class>BinaryCombinator</class>.
    To match the <class>ChildCombinator</class> the node must match the
    right hand selector and its immediate parent must match the left hand
    selector (i.e. it works similar to the <lit>&gt;</lit> combinator in &css;
    or the <lit>/</lit> combinator in XPath).</par>

    <par><class>ChildCombinator</class>s can be created via the division operator (<lit>/</lit>):</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>html.a/html.img</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<img src="http://www.python.org/images/python-logo.gif" alt="homepage" id="logo" border="0" />
    <img id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" border="0" />
    <img id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" border="0" />
    <img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />]]>
    </tty>
    </example>
    """
    symbol = " / "

    def match(self, path):
        # The node itself has to match the right hand selector ...
        if not path or not self.right.match(path):
            return False
        # ... and the path up to its parent the left hand selector
        return self.left.match(path[:-1])
855
856
class DescendantCombinator(BinaryCombinator):
    """
    <par>A <class>DescendantCombinator</class> is a <class>BinaryCombinator</class>.
    To match the <class>DescendantCombinator</class> the node must match the
    right hand selector and any of its ancestor nodes must match the left hand
    selector (i.e. it works similar to the descendant combinator in &css;
    or the <lit>//</lit> combinator in XPath).</par>

    <par><class>DescendantCombinator</class>s can be created via the floor division
    operator (<lit>//</lit>):</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>html.div//html.img</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<img id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" border="0" />
    <img id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" border="0" />
    <img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />]]>
    </tty>
    </example>
    """
    symbol = " // "

    def match(self, path):
        if not path or not self.right.match(path):
            return False
        # Try every proper prefix of the path against the left hand selector,
        # starting with the parent's path and ending with the empty path
        ancestors = path[:-1]
        while True:
            if self.left.match(ancestors):
                return True
            if not ancestors:
                return False
            ancestors = ancestors[:-1]
891
892
class AdjacentSiblingCombinator(BinaryCombinator):
    """
    <par>An <class>AdjacentSiblingCombinator</class> is a <class>BinaryCombinator</class>.
    To match the <class>AdjacentSiblingCombinator</class> the node must match the
    right hand selector and the immediately preceding sibling must match the left
    hand selector.</par>

    <par><class>AdjacentSiblingCombinator</class>s can be created via the
    multiplication operator (<lit>*</lit>). The following example outputs all links
    inside those <class>p</class> elements that immediately follow a
    <class>h2</class> element:</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>html.h2*html.p/html.a</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<a href="http://www.scipy.org/SciPy2007" class="reference">SciPy Conference</a>
    <a href="https://www.enthought.com/scipy07/" class="reference">early registration</a>
    <a href="http://www.europython.org/sections/registration_issues/how-to-register" class="reference">Online registration</a>
    <a href="http://europython.org/" class="reference">EuroPython 2007</a>
    <a href="http://www.osdc.com.au/papers/cfp.html" class="reference">Call For Papers</a>
    <a href="http://www.swa.hpi.uni-potsdam.de/dls07/" class="reference">DLS 2007</a>
    <a href="http://pythonpapers.cgpublisher.com/" class="reference">The Python Papers</a>
    <a href="http://www.pyconuk.org/" class="reference">PyCon UK</a>
    <a href="http://www.pyconuk.org/submit.html" class="reference">proposals for talks</a>
    <a href="http://www.pycon.it/registration/" class="reference">registration online</a>]]>
    </tty>
    </example>
    """

    symbol = " * "

    def match(self, path):
        if len(path) < 2 or not self.right.match(path):
            return False
        node = path[-1]
        # Walk the parent's children and remember the one seen right before node
        previous = None
        for child in path[-2]:
            if child is node:
                break
            previous = child
        if previous is None:
            return False
        # Test the sibling with a path where it takes the place of node
        return self.left.match(path[:-1] + [previous])
941
942
class GeneralSiblingCombinator(BinaryCombinator):
    """
    <par>A <class>GeneralSiblingCombinator</class> is a <class>BinaryCombinator</class>.
    To match the <class>GeneralSiblingCombinator</class> the node must match the
    right hand selector and any of the preceding siblings must match the left
    hand selector.</par>

    <par><class>GeneralSiblingCombinator</class>s can be created via the
    exponentiation operator (<lit>**</lit>). The following example outputs all links
    that are not the first links inside their parent (i.e. they have another link
    among their preceding siblings):</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>html.a**html.a</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<a href="http://www.python.org/about/success/ilm" class="reference">Industrial Light and Magic</a>
    <a href="http://www.python.org/about/success/astra" class="reference">AstraZeneca</a>
    <a href="http://www.python.org/about/success/honeywell" class="reference">Honeywell</a>
    <a href="http://www.python.org/about/success" class="reference">and many others</a>
    <a href="http://www.zope.org/">Zope</a>]]>
    <rep>...</rep>
    </tty>
    </example>
    """

    symbol = " ** "

    def match(self, path):
        if len(path) < 2 or not self.right.match(path):
            return False
        node = path[-1]
        parentpath = path[:-1]
        for child in path[-2]:
            # Reaching node itself means that no previous sibling matched
            if child is node:
                return False
            if self.left.match(parentpath + [child]):
                return True
        return False
984
985
class ChainedCombinator(Combinator):
    """
    <par>A <class>ChainedCombinator</class> combines any number of other
    selectors.</par>
    """

    # Text put between the selectors by __str__; set by subclasses
    symbol = None

    def __init__(self, *selectors):
        self.selectors = selectors

    def __str__(self):
        def format(selector):
            # Other kinds of combinators are put in parentheses
            text = str(selector)
            if isinstance(selector, Combinator) and not isinstance(selector, self.__class__):
                return "(%s)" % text
            return text

        parts = [format(selector) for selector in self.selectors]
        return self.symbol.join(parts)
1005
1006
class OrCombinator(ChainedCombinator):
    """
    <par>An <class>OrCombinator</class> is a <class>ChainedCombinator</class> where
    the node must match at least one of the selectors to match the <class>OrCombinator</class>.
    An <class>OrCombinator</class> can be created with the binary or operator (<lit>|</lit>).</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>xfind.hasattr("href") | xfind.hasattr("src")</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<link type="application/rss+xml" title="RSS" rel="alternate" href="http://www.python.org/channews.rdf" />
    <link media="screen" type="text/css" id="screen-switcher-stylesheet" rel="stylesheet" href="http://www.python.org/styles/screen-switcher-default.css" />
    <link media="scReen" type="text/css" rel="stylesheet" href="http://www.python.org/styles/netscape4.css" />
    <link media="print" type="text/css" rel="stylesheet" href="http://www.python.org/styles/print.css" />
    <link media="screen" type="text/css" title="large text" rel="alternate stylesheet" href="http://www.python.org/styles/largestyles.css" />
    <link media="screen" type="text/css" title="default fonts" rel="alternate stylesheet" href="http://www.python.org/styles/defaultfonts.css" />
    <script src="http://www.python.org/js/iotbs2-key-directors-load.js" type="text/javascript"></script>
    <script src="http://www.python.org/js/iotbs2-directors.js" type="text/javascript"></script>
    <script src="http://www.python.org/js/iotbs2-core.js" type="text/javascript"></script>
    <a accesskey="1" id="logolink" href="http://www.python.org/"><img alt="homepage" src="http://www.python.org/images/python-logo.gif" id="logo" border="0" /></a>]]>
    <rep>...</rep>
    </tty>
    </example>
    """

    symbol = " | "

    def match(self, path):
        # The first selector that matches decides
        for selector in self.selectors:
            if selector.match(path):
                return True
        return False

    def __or__(self, other):
        # Build one flat OrCombinator instead of nesting self inside a new one
        combined = self.selectors + (xsc.makewalkfilter(other),)
        return OrCombinator(*combined)
1043
1044
class AndCombinator(ChainedCombinator):
    """
    <par>An <class>AndCombinator</class> is a <class>ChainedCombinator</class> where
    the node must match all of the combined selectors to match the <class>AndCombinator</class>.
    An <class>AndCombinator</class> can be created with the binary and operator (<lit>&amp;</lit>).</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>html.input &amp; xfind.hasattr("id")</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<input id="domains" name="domains" value="www.python.org" type="hidden" />
    <input id="sitesearch" name="sitesearch" value="www.python.org" type="hidden" />
    <input id="sourceid" name="sourceid" value="google-search" type="hidden" />
    <input id="q" class="input-text" name="q" type="text" />
    <input id="submit" value="search" name="submit" type="submit" class="input-button" />]]>
    </tty>
    </example>
    """

    symbol = " & "

    def match(self, path):
        # The first selector that doesn't match decides
        for selector in self.selectors:
            if not selector.match(path):
                return False
        return True

    def __and__(self, other):
        # Build one flat AndCombinator instead of nesting self inside a new one
        combined = self.selectors + (xsc.makewalkfilter(other),)
        return AndCombinator(*combined)
1075
1076
class NotCombinator(Combinator):
    """
    <par>A <class>NotCombinator</class> inverts the selection logic of the
    underlying selector, i.e. a node matches only if it does not match the underlying
    selector. A <class>NotCombinator</class> can be created with the unary inversion operator (<lit>~</lit>).</par>

    <par>The following example outputs all images that don't have a <lit>border</lit> attribute:</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>html.img &amp; ~xfind.hasattr("border")</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />]]>
    </tty>
    </example>
    """

    def __init__(self, selector):
        self.selector = selector

    def match(self, path):
        return not self.selector.match(path)

    def __str__(self):
        inner = self.selector
        # Combinators other than NotCombinator are put in parentheses
        wrap = isinstance(inner, Combinator) and not isinstance(inner, NotCombinator)
        template = "~(%s)" if wrap else "~%s"
        return template % inner
1109
1110
class CallableSelector(Selector):
    """
    <par>A <class>CallableSelector</class> is a selector that calls a user specified
    callable to select nodes. The callable gets passed the path and must return
    a bool specifying whether this path is selected. A <class>CallableSelector</class>
    is created implicitly whenever a callable is passed to a method that expects
    a walk filter.</par>

    <par>The following example outputs all links that point outside the <lit>python.org</lit> domain:</par>

    <example>
    <tty>
    <prompt>>>> </prompt><input>from ll.xist import parsers, xfind</input>
    <prompt>>>> </prompt><input>from ll.xist.ns import html</input>
    <prompt>>>> </prompt><input>doc = parsers.parseURL("http://www.python.org", tidy=True)</input>
    <prompt>>>> </prompt><input>def foreignlink(path):</input>
    <prompt>... </prompt><input>    return path and isinstance(path[-1], html.a) and not path[-1].attrs.href.asURL().server.endswith(".python.org")</input>
    <prompt>... </prompt><input></input>
    <prompt>>>> </prompt><input>for node in doc.walknode(<em>foreignlink</em>):</input>
    <prompt>... </prompt><input>\tprint node.bytes()</input>
    <prompt>... </prompt><input></input>
    <![CDATA[<a href="http://homegain.com/" class="reference">HomeGain.com</a>
    <a href="http://www.zope.org/">Zope</a>
    <a href="http://www.djangoproject.com/">Django</a>
    <a href="http://www.turbogears.org/">TurboGears</a>
    <a href="http://pyxml.sourceforge.net/topics/">XML</a>]]>
    <rep>..</rep>
    </tty>
    </example>
    """
    def __init__(self, func):
        self.func = func

    def match(self, path):
        # The decision is completely left to the wrapped callable
        decide = self.func
        return decide(path)

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r)" % (clsname, self.func)
1149
1150
class nthchild(Selector):
    """
    <par>Selector that selects a node by its position among the children of
    its parent. <lit>index</lit> is either used to index the parent directly
    or is one of the strings <lit>"even"</lit> and <lit>"odd"</lit>. For
    <lit>"even"</lit> and <lit>"odd"</lit> positions are counted from 0, so
    <lit>"even"</lit> selects the first, third, ... child.</par>
    """
    def __init__(self, index):
        self.index = index

    def match(self, path):
        if len(path) < 2:
            return False
        (parent, node) = (path[-2], path[-1])
        if self.index in ("even", "odd"):
            wantodd = (self.index == "odd")
            for (pos, child) in enumerate(parent):
                if child is node:
                    # compares 0/1 with False/True
                    return (pos % 2) == wantodd
            return False
        try:
            return parent[self.index] is node
        except IndexError:
            return False

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r)" % (clsname, self.index)
1170
1171
class nthoftype(Selector):
    """
    <par>Selector that works like <class>nthchild</class>, but only counts
    those children of the parent that are instances of one of the classes
    passed as <lit>types</lit>. If no types are passed the class of the node
    itself is used. For <lit>"even"</lit> and <lit>"odd"</lit> positions are
    counted from 0.</par>
    """
    def __init__(self, index, *types):
        self.index = index
        self.types = types

    def _find(self, path):
        # Generate those children of the parent that have a fitting type
        wanted = self.types or path[-1].__class__
        for sibling in path[-2]:
            if isinstance(sibling, wanted):
                yield sibling

    def match(self, path):
        if len(path) < 2:
            return False
        node = path[-1]
        if self.index in ("even", "odd"):
            wantodd = (self.index == "odd")
            for (pos, child) in enumerate(self._find(path)):
                if child is node:
                    # compares 0/1 with False/True
                    return (pos % 2) == wantodd
            return False
        try:
            return misc.item(self._find(path), self.index) is node
        except IndexError:
            return False

    def __str__(self):
        clsname = self.__class__.__name__
        if not self.types:
            return "%s(%r)" % (clsname, self.index)
        typenames = ", ".join("%s.%s" % (cls.__module__, cls.__name__) for cls in self.types)
        return "%s(%r, %s)" % (clsname, self.index, typenames)
1201
1202
1203###
1204### CSS helper functions
1205###
1206
def _is_nth_node(iterator, node, index):
    # Return whether ``node`` is the ``index``'th node in ``iterator``
    # (counting starts at 1).
    #
    # ``index`` is an int, a string containing an int, or one of the strings
    # "even" and "odd". For "even"/"odd" ``iterator`` only has to be iterable,
    # otherwise it must support indexing.
    #
    # An index smaller than 1 never matches. A string that is neither "even"
    # nor "odd" nor an int raises a ``ValueError``.
    if index in ("even", "odd"):
        remainder = 1 if index == "even" else 0
        for (i, child) in enumerate(iterator):
            if child is node:
                # ``i`` is zero based, so an odd ``i`` is an even position
                return i % 2 == remainder
        return False
    try:
        index = int(index)
    except ValueError:
        raise ValueError("illegal argument %r" % (index,))
    # This test must be done for ints too (not only for converted strings),
    # otherwise ``index-1`` would be negative and index from the end
    if index < 1:
        return False
    try:
        return iterator[index-1] is node
    except IndexError:
        return False
1233
1234
def _is_nth_last_node(iterator, node, index):
    # Return whether ``node`` is the ``index``'th last node in ``iterator``
    # (counting starts at 1, i.e. 1 is the last node).
    #
    # ``index`` is an int, a string containing an int, or one of the strings
    # "even" and "odd". For "even"/"odd" ``iterator`` only has to be iterable,
    # otherwise it must support indexing.
    #
    # An index smaller than 1 never matches, and neither does a node that
    # isn't in ``iterator``. A string that is neither "even" nor "odd" nor
    # an int raises a ``ValueError``.
    if index in ("even", "odd"):
        pos = None
        last = None
        for (last, child) in enumerate(iterator):
            if child is node:
                pos = last
        if pos is None:
            # ``node`` isn't in ``iterator`` (or ``iterator`` is empty)
            return False
        # ``last-pos`` is 0 for the last node, i.e. for position 1 from the end
        remainder = 1 if index == "even" else 0
        return (last-pos) % 2 == remainder
    try:
        index = int(index)
    except ValueError:
        raise ValueError("illegal argument %r" % (index,))
    # This test must be done for ints too (not only for converted strings),
    # otherwise ``-index`` would be 0 or positive and index from the start
    if index < 1:
        return False
    try:
        return iterator[-index] is node
    except IndexError:
        return False
1263
1264
def _children_of_type(node, type):
    # Generate those children of ``node`` that are elements with the
    # XML name ``type``
    for child in node:
        if not isinstance(child, xsc.Element):
            continue
        if child.xmlname == type:
            yield child
1269
1270
1271###
1272### CSS selectors
1273###
1274
class CSSHasAttributeSelector(Selector):
    """
    <par>Selector for element nodes for which the attribute with the &xml; name
    <lit>attributename</lit> is allowed and for which
    <lit>node.attrs.has_xml(attributename)</lit> is true.</par>
    """
    def __init__(self, attributename):
        self.attributename = attributename

    def match(self, path):
        node = path[-1] if path else None
        if not isinstance(node, xsc.Element):
            return False
        if not node.Attrs.isallowed_xml(self.attributename):
            return False
        return node.attrs.has_xml(self.attributename)

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r)" % (clsname, self.attributename)
1288
1289
class CSSAttributeListSelector(Selector):
    """
    <par>Selector for element nodes where the value of the attribute with the
    &xml; name <lit>attributename</lit> contains <lit>attributevalue</lit> as
    one of its whitespace separated words.</par>
    """
    def __init__(self, attributename, attributevalue):
        (self.attributename, self.attributevalue) = (attributename, attributevalue)

    def match(self, path):
        node = path[-1] if path else None
        if not isinstance(node, xsc.Element):
            return False
        if not node.Attrs.isallowed_xml(self.attributename):
            return False
        words = unicode(node.attrs.get_xml(self.attributename)).split()
        return self.attributevalue in words

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attributename, self.attributevalue)
1305
1306
class CSSAttributeLangSelector(Selector):
    """
    <par>Selector for element nodes where the value of the attribute with the
    &xml; name <lit>attributename</lit>, cut off at the first <lit>-</lit>,
    is equal to <lit>attributevalue</lit>.</par>
    """
    def __init__(self, attributename, attributevalue):
        (self.attributename, self.attributevalue) = (attributename, attributevalue)

    def match(self, path):
        node = path[-1] if path else None
        if not isinstance(node, xsc.Element):
            return False
        if not node.Attrs.isallowed_xml(self.attributename):
            return False
        value = unicode(node.attrs.get_xml(self.attributename))
        # split() with a separator always returns at least one item
        prefix = value.split("-", 1)[0]
        return prefix == self.attributevalue

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attributename, self.attributevalue)
1324
1325
class CSSFirstChildSelector(Selector):
    """
    <par>Selector for nodes that are the first element among the children of
    their parent.</par>
    """
    def match(self, path):
        # Without a parent there are no siblings to compare with
        if len(path) < 2:
            return False
        siblings = path[-2][xsc.Element]
        return _is_nth_node(siblings, path[-1], 1)

    def __str__(self):
        return "CSSFirstChildSelector()"
1332
1333
class CSSLastChildSelector(Selector):
    """
    <par>Selector for nodes that are the last element among the children of
    their parent.</par>
    """
    def match(self, path):
        # Without a parent there are no siblings to compare with
        if len(path) < 2:
            return False
        siblings = path[-2][xsc.Element]
        return _is_nth_last_node(siblings, path[-1], 1)

    def __str__(self):
        return "CSSLastChildSelector()"
1340
1341
class CSSFirstOfTypeSelector(Selector):
    """
    <par>Selector for elements that are the first among those children of
    their parent that are elements with the same &xml; name.</par>
    """
    def match(self, path):
        if len(path) < 2:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        # _is_nth_node() indexes its first argument, so wrap the generator
        sametype = misc.Iterator(_children_of_type(path[-2], node.xmlname))
        return _is_nth_node(sametype, node, 1)

    def __str__(self):
        return "CSSFirstOfTypeSelector()"
1351
1352
class CSSLastOfTypeSelector(Selector):
    """
    <par>Selector for elements that are the last among those children of
    their parent that are elements with the same &xml; name.</par>
    """
    def match(self, path):
        if len(path) < 2:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        # _is_nth_last_node() indexes its first argument, so wrap the generator
        sametype = misc.Iterator(_children_of_type(path[-2], node.xmlname))
        return _is_nth_last_node(sametype, node, 1)

    def __str__(self):
        return "CSSLastOfTypeSelector()"
1362
1363
class CSSOnlyChildSelector(Selector):
    """
    <par>Selector for elements whose parent has no other element among its
    children.</par>
    """
    def match(self, path):
        if len(path) < 2:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        # Any element child other than node itself is one too many
        return all(child is node for child in path[-2][xsc.Element])

    def __str__(self):
        return "CSSOnlyChildSelector()"
1377
1378
class CSSOnlyOfTypeSelector(Selector):
    """
    <par>Selector for elements whose parent has no other element with the same
    &xml; name among its children.</par>
    """
    def match(self, path):
        if len(path) < 2:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        # Any child of the same type other than node itself is one too many
        return all(child is node for child in _children_of_type(path[-2], node.xmlname))

    def __str__(self):
        return "CSSOnlyOfTypeSelector()"
1392
1393
class CSSEmptySelector(Selector):
    """
    <par>Selector for elements whose content contains neither elements nor
    non-empty text nodes.</par>
    """
    def match(self, path):
        node = path[-1] if path else None
        if not isinstance(node, xsc.Element):
            return False
        for child in node.content:
            if isinstance(child, xsc.Element):
                return False
            # empty text nodes don't count as content
            if isinstance(child, xsc.Text) and child:
                return False
        return True

    def __str__(self):
        return "CSSEmptySelector()"
1407
1408
class CSSRootSelector(Selector):
    """
    <par>Selector for elements that are the only node in the path, i.e. that
    have no parent in the path.</par>
    """
    def match(self, path):
        if len(path) != 1:
            return False
        return isinstance(path[-1], xsc.Element)

    def __str__(self):
        return "CSSRootSelector()"
1415
1416
class CSSFunctionSelector(Selector):
    """
    <par>Base class of those selectors in this module that have an argument,
    which is stored in the attribute <lit>value</lit>.</par>
    """
    def __init__(self, value=None):
        self.value = value

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r)" % (clsname, self.value)
1423
1424
class CSSNthChildSelector(CSSFunctionSelector):
    """
    <par>Selector for elements that are at position <lit>value</lit> (counted
    from 1, or <lit>"even"</lit>/<lit>"odd"</lit>) among the element children
    of their parent.</par>
    """
    def match(self, path):
        if len(path) < 2:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        return _is_nth_node(path[-2][xsc.Element], node, self.value)
1432
1433
class CSSNthLastChildSelector(CSSFunctionSelector):
    """
    <par>Selector for elements that are at position <lit>value</lit> (counted
    from 1 starting at the end, or <lit>"even"</lit>/<lit>"odd"</lit>) among
    the element children of their parent.</par>
    """
    def match(self, path):
        if len(path) < 2:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        return _is_nth_last_node(path[-2][xsc.Element], node, self.value)
1441
1442
class CSSNthOfTypeSelector(CSSFunctionSelector):
    """
    <par>Selector for elements that are at position <lit>value</lit> (counted
    from 1, or <lit>"even"</lit>/<lit>"odd"</lit>) among those children of
    their parent that are elements with the same &xml; name.</par>
    """
    def match(self, path):
        if len(path) < 2:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        # _is_nth_node() indexes its first argument, so wrap the generator
        sametype = misc.Iterator(_children_of_type(path[-2], node.xmlname))
        return _is_nth_node(sametype, node, self.value)
1450
1451
class CSSNthLastOfTypeSelector(CSSFunctionSelector):
    """
    <par>Selector for elements that are at position <lit>value</lit> (counted
    from 1 starting at the end, or <lit>"even"</lit>/<lit>"odd"</lit>) among
    those children of their parent that are elements with the same &xml; name.</par>
    """
    def match(self, path):
        if len(path) < 2:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        # _is_nth_last_node() indexes its first argument, so wrap the generator
        sametype = misc.Iterator(_children_of_type(path[-2], node.xmlname))
        return _is_nth_last_node(sametype, node, self.value)
1459
1460
class CSSTypeSelector(Selector):
    """
    <par>Selector that tests the &xml; name (<lit>type</lit>) and the namespace
    name (<lit>xmlns</lit>) of a node. <lit>"*"</lit> (the default for both)
    means that the respective test is skipped. Additionally the path must
    match every selector in the attribute <lit>selectors</lit>. The initial
    content of this list can be passed to the constructor as further
    positional arguments.</par>
    """
    def __init__(self, type="*", xmlns="*", *selectors):
        self.type = type
        self.xmlns = xsc.nsname(xmlns)
        # id, class, attribute etc. selectors for this node. The selectors
        # passed to the constructor are kept (instead of being dropped) and
        # copied into a list, which is the type the attribute always had.
        self.selectors = list(selectors)

    def match(self, path):
        if not path:
            return False
        node = path[-1]
        if self.type != "*" and node.xmlname != self.type:
            return False
        if self.xmlns != "*" and node.xmlns != self.xmlns:
            return False
        # All additional selectors have to match the same path
        for selector in self.selectors:
            if not selector.match(path):
                return False
        return True

    def __str__(self):
        # Arguments that still have their default value are left out, as long
        # as no argument that must be shown follows them
        args = []
        if self.type != "*" or self.xmlns != "*" or self.selectors:
            args.append(repr(self.type))
        if self.xmlns != "*" or self.selectors:
            args.append(repr(self.xmlns))
        args.extend(str(selector) for selector in self.selectors)
        return "%s(%s)" % (self.__class__.__name__, ", ".join(args))
1492
1493
class CSSAdjacentSiblingCombinator(BinaryCombinator):
    """
    <par>A <class>CSSAdjacentSiblingCombinator</class> works similarly to an
    <class>AdjacentSiblingCombinator</class>, except that only preceding
    elements are considered: the right hand selector has to match the node
    itself and the left hand selector has to match the entry of
    <lit>parent[xsc.Element]</lit> that comes directly before the node.</par>
    """

    def match(self, path):
        if len(path) < 2 or not self.right.match(path):
            return False
        node = path[-1]
        # Remember the last element seen before we run into the node itself
        previous = None
        for element in path[-2][xsc.Element]:
            if element is node:
                break
            previous = element
        if previous is None:
            return False
        # Test the left hand side with the sibling in place of the node
        return self.left.match(path[:-1] + [previous])

    def __str__(self):
        parts = (self.__class__.__name__, self.left, self.right)
        return "%s(%s, %s)" % parts
1516
1517
class CSSGeneralSiblingCombinator(BinaryCombinator):
    """
    <par>A <class>CSSGeneralSiblingCombinator</class> works similarly to a
    <class>GeneralSiblingCombinator</class>, except that only preceding
    elements are considered: the right hand selector has to match the node
    itself and the left hand selector has to match any entry of
    <lit>parent[xsc.Element]</lit> that comes before the node.</par>
    """

    def match(self, path):
        if len(path) < 2 or not self.right.match(path):
            return False
        node = path[-1]
        parentpath = path[:-1]
        for element in path[-2][xsc.Element]:
            # Reaching the node itself means that none of the elements
            # in front of it matched the left hand side
            if element is node:
                break
            if self.left.match(parentpath + [element]):
                return True
        return False

    def __str__(self):
        parts = (self.__class__.__name__, self.left, self.right)
        return "%s(%s, %s)" % parts
1537
1538
# Maps the operator inside a CSS attribute selector to the selector class
# that css() instantiates with (attributename, attributevalue).
_attributecombinator2class = {
    "=": attrhasvalue_xml,
    "~=": CSSAttributeListSelector,
    "|=": CSSAttributeLangSelector,
    "^=": attrstartswith_xml,
    "$=": attrendswith_xml,
    "*=": attrcontains_xml,
}

# Maps a CSS combinator between two simple selectors to the combinator class
# that css() instantiates with (left selector, right selector).
_combinator2class = {
    " ": DescendantCombinator,
    ">": ChildCombinator,
    "+": CSSAdjacentSiblingCombinator,
    "~": CSSGeneralSiblingCombinator,
}

# Maps the name of a CSS pseudo-class to the selector class that css()
# instantiates without arguments.
_pseudoname2class = {
    "first-child": CSSFirstChildSelector,
    "last-child": CSSLastChildSelector,
    "first-of-type": CSSFirstOfTypeSelector,
    "last-of-type": CSSLastOfTypeSelector,
    "only-child": CSSOnlyChildSelector,
    "only-of-type": CSSOnlyOfTypeSelector,
    "empty": CSSEmptySelector,
    "root": CSSRootSelector,
}

# Maps the name of a CSS function (without the trailing "(") to the selector
# class that css() instantiates without arguments; the function argument is
# stored in the instance's value attribute afterwards.
_function2class = {
    "nth-child": CSSNthChildSelector,
    "nth-last-child": CSSNthLastChildSelector,
    "nth-of-type": CSSNthOfTypeSelector,
    "nth-last-of-type": CSSNthLastOfTypeSelector,
}
1572
1573
def css(selectors, prefixes=None):
    """
    Create a walk filter that will yield all nodes that match the specified
    &css; expression. <arg>selectors</arg> must be a string (support for
    <class>cssutils.css.selector.Selector</class> objects is not implemented
    yet; anything that is not a string raises a <class>TypeError</class>).
    <arg>prefixes</arg> may be a mapping that maps namespace prefixes to
    namespace names.

    A <class>ValueError</class> is raised for unknown pseudo-classes,
    functions and combinators, an <class>xsc.IllegalPrefixError</class> for
    namespace prefixes that are not in <arg>prefixes</arg>.
    """

    if not isinstance(selectors, basestring):
        raise TypeError("selectors must be a string, not %s" % type(selectors).__name__) # FIXME: cssutils object

    # Wrap the selector in a complete (empty) style rule, so that cssutils can
    # parse it; namespace prefixes are declared via @namespace rules.
    if prefixes is not None:
        prefixes = dict((key, xsc.nsname(value)) for (key, value) in prefixes.iteritems())
        namespaces = "\n".join("@namespace %s %r;" % (key if key is not None else "", value) for (key, value) in prefixes.iteritems())
        selectors = "%s\n%s{}" % (namespaces, selectors)
    else:
        selectors = "%s{}" % selectors

    # Fetch the selector list of the first style rule
    for cssrule in cssutils.CSSParser().parseString(selectors).cssRules:
        if isinstance(cssrule, cssstylerule.CSSStyleRule):
            selectors = cssrule.selectorList
            break
    else:
        raise ValueError("can't happen")

    orcombinators = []
    for selector in selectors:
        # rule is the simple selector currently being filled,
        # root is the combinator tree built so far
        rule = root = CSSTypeSelector()
        prefix = None
        attributename = None
        attributevalue = None
        combinator = None
        inattr = False # are we between "[" and "]"?
        for token in selector.seq:
            tokentype = token["type"]
            value = token["value"]
            if tokentype == "prefix":
                prefix = value
            elif tokentype == "pipe":
                if prefix != "*":
                    # Without a prefix mapping every prefix is unknown
                    if prefixes is None or prefix not in prefixes:
                        raise xsc.IllegalPrefixError(prefix)
                    rule.xmlns = prefixes[prefix]
                prefix = None
            elif tokentype == "type":
                rule.type = value
            elif tokentype == "id":
                rule.selectors.append(hasid(value.lstrip("#")))
            elif tokentype == "classname":
                rule.selectors.append(hasclass(value))
            elif tokentype == "pseudoname":
                # Only the lookup is guarded, so that a KeyError raised by
                # the constructor isn't reported as an unknown name
                try:
                    pseudoclass = _pseudoname2class[value]
                except KeyError:
                    raise ValueError("unknown pseudoname %s" % value)
                rule.selectors.append(pseudoclass())
            elif tokentype == "function":
                try:
                    functionclass = _function2class[value.rstrip("(")]
                except KeyError:
                    raise ValueError("unknown function %s" % value)
                rule.selectors.append(functionclass())
                # NOTE(review): nothing in css() reads this attribute; kept in
                # case code elsewhere relies on it -- confirm.
                rule.function = value
            elif tokentype == "functionvalue":
                # Argument of the function selector appended above
                rule.selectors[-1].value = value
            elif tokentype == "attributename":
                attributename = value
            elif tokentype == "attributevalue":
                # Strip the quotes from quoted values
                if value.startswith("'") and value.endswith("'"):
                    value = value[1:-1]
                elif value.startswith('"') and value.endswith('"'):
                    value = value[1:-1]
                attributevalue = value
            elif tokentype == "attribute selector":
                combinator = None
                inattr = True
            elif tokentype == "attribute selector end":
                if combinator is None:
                    rule.selectors.append(CSSHasAttributeSelector(attributename))
                else:
                    try:
                        attributeclass = _attributecombinator2class[combinator]
                    except KeyError:
                        # Report the combinator itself (not the attribute value)
                        raise ValueError("unknown combinator %s" % combinator)
                    rule.selectors.append(attributeclass(attributename, attributevalue))
                inattr = False
            elif tokentype == "combinator":
                if inattr:
                    # Inside of "[...]" this is the attribute operator
                    combinator = value
                else:
                    try:
                        combinatorclass = _combinator2class[value]
                    except KeyError:
                        raise ValueError("unknown combinator %s" % value)
                    # Start a new simple selector for the right hand side
                    rule = CSSTypeSelector()
                    root = combinatorclass(root, rule)
        orcombinators.append(root)
    return orcombinators[0] if len(orcombinators) == 1 else OrCombinator(*orcombinators)
Note: See TracBrowser for help on using the browser.