root/livinglogic.python.xist/src/ll/xist/xfind.py @ 3180:c414962d9e75

Revision 3180:c414962d9e75, 38.7 KB (checked in by Walter Doerwald <walter@…>, 12 years ago)

Convert all docstrings to ReST.

Line 
1# -*- coding: utf-8 -*-
2
3## Copyright 1999-2008 by LivingLogic AG, Bayreuth/Germany
4## Copyright 1999-2008 by Walter Dörwald
5##
6## All Rights Reserved
7##
8## See xist/__init__.py for the license
9
10
11"""
12This module contains XFind and CSS selectors and related classes and functions.
13
14A selector is a XIST tree traversal filter that traverses the complete XML tree
15and outputs those nodes specified by the selector. Selectors can be combined
16with various operations and form a language comparable to XPath__ but
17implemented as Python expressions.
18
19__ http://www.w3.org/TR/xpath
20"""
21
22
23from ll import misc
24from ll.xist import xsc
25
26
# Declares the markup used by the docstrings in this module (reStructuredText).
__docformat__ = "reStructuredText"
28
29
class CSSWeight(tuple):
    """
    The specificity of a CSS selector (compare the specificity rules of
    CSS3__), stored as a tuple with the four components ``(a, b, c, d)``.
    Every component defaults to ``0``.

    Adding two weights adds them component by component and gives a new
    :class:`CSSWeight` object.

    __ http://www.w3.org/TR/css3-selectors/#specificity
    """

    def __new__(cls, a=0, b=0, c=0, d=0):
        components = (a, b, c, d)
        return tuple.__new__(cls, components)

    def __add__(self, other):
        # :var:`other` only has to support indexing with 0 to 3
        sums = [self[index] + other[index] for index in (0, 1, 2, 3)]
        return CSSWeight(*sums)

    def __repr__(self):
        return "CSSWeight(%r, %r, %r, %r)" % tuple(self)
45
46
class Selector(xsc.WalkFilter):
    """
    Base class for all tree traversal filters that visit the complete tree.
    Whether a node gets output can be specified by overwriting the
    :meth:`matchpath` method. Selectors can be combined with various operations
    (see methods below).
    """

    @misc.notimplemented
    def matchpath(self, path):
        """
        Return whether the node at the end of :var:`path` is selected.
        :var:`path` is a sequence of nodes whose last item is the node to be
        tested. This method must be overwritten in subclasses.
        """
        pass

    def filterpath(self, path):
        """
        Return the walk options for :var:`path`. Content and attributes are
        always entered; ``True`` is put in front when :meth:`matchpath` accepts
        :var:`path` (presumably this is what makes the walker output the node;
        see :mod:`ll.xist.xsc` for the meaning of the options).
        """
        return (True, xsc.entercontent, xsc.enterattrs) if self.matchpath(path) else (xsc.entercontent, xsc.enterattrs)

    def __div__(self, other):
        """
        Create a :class:`ChildCombinator` with :var:`self` as the left hand
        selector and :var:`other` as the right hand selector.
        """
        return ChildCombinator(self, xsc.makewalkfilter(other))

    def __truediv__(self, other):
        """
        Does the same as :meth:`__div__`. The ``/`` operator calls this method
        instead of :meth:`__div__` when true division is active (i.e. with
        ``from __future__ import division``), so without it ``/`` would fail
        there.
        """
        return self.__div__(other)

    def __floordiv__(self, other):
        """
        Create a :class:`DescendantCombinator` with :var:`self` as the left hand
        selector and :var:`other` as the right hand selector.
        """
        return DescendantCombinator(self, xsc.makewalkfilter(other))

    def __mul__(self, other):
        """
        Create an :class:`AdjacentSiblingCombinator` with :var:`self` as the left
        hand selector and :var:`other` as the right hand selector.
        """
        return AdjacentSiblingCombinator(self, xsc.makewalkfilter(other))

    def __pow__(self, other):
        """
        Create a :class:`GeneralSiblingCombinator` with :var:`self` as the left
        hand selector and :var:`other` as the right hand selector.
        """
        return GeneralSiblingCombinator(self, xsc.makewalkfilter(other))

    def __and__(self, other):
        """
        Create an :class:`AndCombinator` from :var:`self` and :var:`other`.
        """
        return AndCombinator(self, xsc.makewalkfilter(other))

    def __or__(self, other):
        """
        Create an :class:`OrCombinator` from :var:`self` and :var:`other`.
        """
        return OrCombinator(self, xsc.makewalkfilter(other))

    def __invert__(self):
        """
        Create a :class:`NotCombinator` inverting :var:`self`.
        """
        return NotCombinator(self)

    def cssweight(self):
        """
        Return the CSS specificity of :var:`self` as a :class:`CSSWeight` object.
        This default implementation returns a weight with all components zero.
        """
        return CSSWeight()
113
114
class IsInstanceSelector(Selector):
    """
    Selector that selects all nodes that are instances of (one of) the
    specified type(s). You can either create an :class:`IsInstanceSelector`
    object directly or simply pass a class to a function that expects a walk
    filter (this class will be automatically wrapped in an
    :class:`IsInstanceSelector`)::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(html.a):
        ...     print node.bytes()
        ...
        <a id="logolink" accesskey="1" href="http://www.python.org/"><img src="http://www.python.org/images/python-logo.gif" id="logo" border="0" alt="homepage" /></a>
        <a accesskey="2" href="http://www.python.org/#left%2dhand%2dnavigation"><img id="skiptonav" src="http://www.python.org/images/trans.gif" border="0" alt="skip to navigation" /></a>
        <a accesskey="3" href="http://www.python.org/#content%2dbody"><img id="skiptocontent" src="http://www.python.org/images/trans.gif" border="0" alt="skip to content" /></a>
        <a class="reference" href="http://www.python.org/search">Advanced Search</a>
        <a title="About The Python Language" href="http://www.python.org/about/">About</a>
        ...
    """
    def __init__(self, *types):
        self.types = types

    def matchpath(self, path):
        # An empty path never matches
        return bool(path) and isinstance(path[-1], self.types)

    def __or__(self, other):
        """
        Combine :var:`self` and :var:`other`. If :var:`other` is a type check
        too (i.e. a node class or another :class:`IsInstanceSelector`) both are
        merged into one :class:`IsInstanceSelector`, otherwise the generic
        :meth:`Selector.__or__` is used.
        """
        if isinstance(other, xsc._Node_Meta):
            moretypes = (other,)
        elif isinstance(other, IsInstanceSelector):
            moretypes = other.types
        else:
            return Selector.__or__(self, other)
        return IsInstanceSelector(*(self.types + moretypes))

    def __getitem__(self, index):
        """
        Return an :class:`nthoftype` selector that uses :var:`index` as the
        index and ``self.types`` as the types.
        """
        return nthoftype(index, *self.types)

    def __str__(self):
        names = ["%s.%s" % (cls.__module__, cls.__name__) for cls in self.types]
        if len(names) == 1:
            return names[0]
        return "(%s)" % " | ".join(names)
163
164
class hasname(Selector):
    """
    Selector that selects all nodes that have a specified Python name (which
    only selects elements, processing instructions and entities). A namespace
    name can be specified as a second argument; in this case only elements
    from the specified namespace will be selected::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.hasname("img")):
        ...     print node.bytes()
        ...
        <img border="0" src="http://www.python.org/images/python-logo.gif" alt="homepage" id="logo" />
        <img border="0" id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" />
        <img border="0" id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" />
        <img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />
    """
    def __init__(self, name, xmlns=None):
        self.name = name
        self.xmlns = xsc.nsname(xmlns)

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if self.xmlns is None:
            # Without a namespace any named node type qualifies
            named = (xsc.Element, xsc.ProcInst, xsc.Entity)
            return isinstance(node, named) and node.__class__.__name__ == self.name
        # With a namespace only elements can match
        return isinstance(node, xsc.Element) and node.__class__.__name__ == self.name and node.xmlns == self.xmlns

    def __str__(self):
        return "%s(%r)" % (self.__class__.__name__, self.name)
197
198
class hasname_xml(Selector):
    """
    :class:`hasname_xml` works similar to :class:`hasname` except that the
    specified name is compared to the XML name of the node (its ``xmlname``
    attribute) instead of the Python name.
    """
    def __init__(self, name, xmlns=None):
        self.name = name
        self.xmlns = xsc.nsname(xmlns)

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if self.xmlns is None:
            # Without a namespace any named node type qualifies
            named = (xsc.Element, xsc.ProcInst, xsc.Entity)
            return isinstance(node, named) and node.xmlname == self.name
        # With a namespace only elements can match
        return isinstance(node, xsc.Element) and node.xmlname == self.name and node.xmlns == self.xmlns

    def __str__(self):
        return "%s(%r)" % (self.__class__.__name__, self.name)
219
220
class IsSelector(Selector):
    """
    Selector that selects one specific node in the tree. This can be combined
    with other selectors via :class:`ChildCombinator` or
    :class:`DescendantCombinator` selectors to select children of this specific
    node. You can either create an :class:`IsSelector` directly or simply pass
    a node to a function that expects a walk filter::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(doc[0]/xsc.Element):
        ...     print repr(node)
        ...
        <ll.xist.ns.html.head element object (13 children/no attrs) (from http://www.python.org/:6:?) at 0xb6c82f4c>
        <ll.xist.ns.html.body element object (19 children/no attrs) (from http://www.python.org/:26:?) at 0xb6c3154c>
    """
    def __init__(self, node):
        self.node = node

    def matchpath(self, path):
        """
        Return whether the last node in :var:`path` is the node this selector
        was created with (compared by identity). An empty path never matches.
        """
        # Always return a real boolean (an empty path would otherwise be
        # returned itself instead of ``False``)
        return bool(path) and path[-1] is self.node

    def __str__(self):
        return "%s(%r)" % (self.__class__.__name__, self.node)
245
246
class isroot(Selector):
    """
    Selector that selects the node at the start of the path, i.e. it matches
    when the path consists of exactly one node.
    """

    def matchpath(self, path):
        depth = len(path)
        return depth == 1

    def __str__(self):
        return "isroot"


# Replace the class with its only instance, so that ``xfind.isroot`` can be used directly as a selector
isroot = isroot()
256
257
class empty(Selector):
    """
    Selector that selects all empty elements or fragments (i.e. those with
    a length of zero)::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.empty):
        ...     print node.bytes()
        ...
        <meta content="text/html; charset=utf-8" http-equiv="content-type" />
        <meta content="python programming language object oriented web free source" name="keywords" />
        <meta content="      Home page for Python, an interpreted, interactive, object-oriented, extensible
              programming language. It provides an extraordinary combination of clarity and
              versatility, and is free and comprehensively ported. " name="description" />
        <a type="application/rss+xml" href="http://www.python.org/channews.rdf" rel="alternate" title="RSS" />
        ...
    """

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        # Only elements and fragments can be empty
        if not isinstance(node, (xsc.Element, xsc.Frag)):
            return False
        return len(node) == 0

    def __str__(self):
        return "empty"


# Replace the class with its only instance, so that ``xfind.empty`` can be used directly as a selector
empty = empty()
288
289
class onlychild(Selector):
    """
    Selector that selects all nodes that are the only child of their parents::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.onlychild & html.a):
        ...     print node.bytes()
        ...
        <a accesskey="2" href="http://www.python.org/#left%2dhand%2dnavigation"><img id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" border="0" /></a>
        <a accesskey="3" href="http://www.python.org/#content%2dbody"><img id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" border="0" /></a>
        <a href="http://www.python.org/download/releases/2.5.1">Quick Links (2.5.1)</a>
        <a title="Manuals for Latest Stable Release" href="http://docs.python.org/">Documentation</a>
        ...
    """

    def matchpath(self, path):
        # A node without a parent in the path can't be an only child
        if len(path) < 2:
            return False
        parent = path[-2]
        if not isinstance(parent, (xsc.Frag, xsc.Element)):
            return False
        return len(parent) == 1 and parent[0] is path[-1]

    def __str__(self):
        return "onlychild"


# Replace the class with its only instance, so that ``xfind.onlychild`` can be used directly as a selector
onlychild = onlychild()
318
319
class onlyoftype(Selector):
    """
    Selector that selects all nodes that are the only nodes of their type among
    their siblings (a sibling counts as being of the same type if it is an
    instance of the class of the node)::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.onlyoftype & xsc.Element):
        ...     print repr(node)
        ...
        <ll.xist.ns.html.html element object (2 children/1 attr) (from http://www.python.org/:4:?) at 0xb6d6e7ec>
        <ll.xist.ns.html.head element object (13 children/no attrs) (from http://www.python.org/:6:?) at 0xb6cc1f8c>
        <ll.xist.ns.html.title element object (1 child/no attrs) (from http://www.python.org/:8:?) at 0xb6d79b8c>
        <ll.xist.ns.html.body element object (19 children/no attrs) (from http://www.python.org/:26:?) at 0xb6d7282c>
        ...
    """

    def matchpath(self, path):
        if len(path) < 2:
            return False
        node = path[-1]
        parent = path[-2]
        if not isinstance(parent, (xsc.Frag, xsc.Element)):
            return False
        nodetype = node.__class__
        # Any other sibling of the same type disqualifies the node
        return not any(isinstance(sibling, nodetype) and sibling is not node for sibling in parent)

    def __str__(self):
        return "onlyoftype"


# Replace the class with its only instance, so that ``xfind.onlyoftype`` can be used directly as a selector
onlyoftype = onlyoftype()
354
355
class hasattr(Selector):
    """
    Selector that selects all element nodes that have an attribute with one
    of the specified Python names. For selecting nodes with global attributes
    the attribute class can be passed::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html, xml
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.hasattr(xml.Attrs.lang)):
        ...     print repr(node)
        ...
        <ll.xist.ns.html.html element object (2 children/2 attrs) (from http://www.python.org/:4:?) at 0xb6d71d4c>
    """

    def __init__(self, *attrnames):
        self.attrnames = attrnames

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        # One allowed attribute that is present is enough
        return any(node.Attrs.isallowed(name) and node.attrs.has(name) for name in self.attrnames)

    def __str__(self):
        names = ", ".join(map(repr, self.attrnames))
        return "%s(%s)" % (self.__class__.__name__, names)
385
386
class hasattr_xml(Selector):
    """
    :class:`hasattr_xml` works similar to :class:`hasattr` except that the
    specified names are treated as XML names instead of Python names.
    """

    def __init__(self, *attrnames):
        self.attrnames = attrnames

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        # One allowed attribute that is present is enough
        return any(node.Attrs.isallowed_xml(name) and node.attrs.has_xml(name) for name in self.attrnames)

    def __str__(self):
        names = ", ".join(map(repr, self.attrnames))
        return "%s(%s)" % (self.__class__.__name__, names)
407
408
class attrhasvalue(Selector):
    """
    Selector that selects all element nodes where an attribute with the
    specified Python name has the specified value. For global attributes the
    attribute class can be passed. Note that "fancy" attributes (i.e. those
    containing non-text) will not be considered::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.attrhasvalue("rel", "stylesheet")):
        ...     print repr(node)
        ...
        <a media="screen" type="text/css" href="http://www.python.org/styles/screen-switcher-default.css" rel="stylesheet" id="screen-switcher-stylesheet" />
        <a media="scReen" type="text/css" rel="stylesheet" href="http://www.python.org/styles/netscape4.css" />
        <a media="print" type="text/css" rel="stylesheet" href="http://www.python.org/styles/print.css" />
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        if not node.Attrs.isallowed(self.attrname):
            return False
        attr = node.attrs.get(self.attrname)
        # Fancy attributes (e.g. those containing PIs) never match
        if attr.isfancy():
            return False
        return unicode(attr) == self.attrvalue

    def __str__(self):
        args = (self.__class__.__name__, self.attrname, self.attrvalue)
        return "%s(%r, %r)" % args
441
442
class attrhasvalue_xml(Selector):
    """
    :class:`attrhasvalue_xml` works similar to :class:`attrhasvalue` except that
    the specified name is treated as an XML name instead of a Python name.
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        if not node.Attrs.isallowed_xml(self.attrname):
            return False
        attr = node.attrs.get_xml(self.attrname)
        # Fancy attributes (e.g. those containing PIs) never match
        if attr.isfancy():
            return False
        return unicode(attr) == self.attrvalue

    def __str__(self):
        args = (self.__class__.__name__, self.attrname, self.attrvalue)
        return "%s(%r, %r)" % args
464
465
class attrcontains(Selector):
    """
    Selector that selects all element nodes where an attribute with the
    specified Python name contains the specified substring in its value. For
    global attributes the attribute class can be passed. Note that "fancy"
    attributes (i.e. those containing non-text) will not be considered::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.attrcontains("rel", "stylesheet")):
        ...     print repr(node)
        ...
        <a type="text/css" id="screen-switcher-stylesheet" media="screen" rel="stylesheet" href="http://www.python.org/styles/screen-switcher-default.css" />
        <a type="text/css" media="scReen" rel="stylesheet" href="http://www.python.org/styles/netscape4.css" />
        <a type="text/css" media="print" rel="stylesheet" href="http://www.python.org/styles/print.css" />
        <a type="text/css" title="large text" media="screen" rel="alternate stylesheet" href="http://www.python.org/styles/largestyles.css" />
        <a type="text/css" title="default fonts" media="screen" rel="alternate stylesheet" href="http://www.python.org/styles/defaultfonts.css" />
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        if not node.Attrs.isallowed(self.attrname):
            return False
        attr = node.attrs.get(self.attrname)
        # Fancy attributes (e.g. those containing PIs) never match
        if attr.isfancy():
            return False
        return self.attrvalue in unicode(attr)

    def __str__(self):
        args = (self.__class__.__name__, self.attrname, self.attrvalue)
        return "%s(%r, %r)" % args
500
501
class attrcontains_xml(Selector):
    """
    :class:`attrcontains_xml` works similar to :class:`attrcontains` except that
    the specified name is treated as an XML name instead of a Python name.
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        if not node.Attrs.isallowed_xml(self.attrname):
            return False
        attr = node.attrs.get_xml(self.attrname)
        # Fancy attributes (e.g. those containing PIs) never match
        if attr.isfancy():
            return False
        return self.attrvalue in unicode(attr)

    def __str__(self):
        args = (self.__class__.__name__, self.attrname, self.attrvalue)
        return "%s(%r, %r)" % args
523
524
class attrstartswith(Selector):
    """
    Selector that selects all element nodes where an attribute with the
    specified Python name starts with the specified string. For global
    attributes the attribute class can be passed. Note that "fancy" attributes
    (i.e. those containing non-text) will not be considered::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.attrstartswith("class_", "input-")):
        ...     print repr(node)
        ...
        <input class="input-text" id="q" type="text" name="q" />
        <input value="search" class="input-button" id="submit" type="submit" name="submit" />
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        if not node.Attrs.isallowed(self.attrname):
            return False
        attr = node.attrs.get(self.attrname)
        # Fancy attributes (e.g. those containing PIs) never match
        if attr.isfancy():
            return False
        return unicode(attr).startswith(self.attrvalue)

    def __str__(self):
        args = (self.__class__.__name__, self.attrname, self.attrvalue)
        return "%s(%r, %r)" % args
556
557
class attrstartswith_xml(Selector):
    """
    :class:`attrstartswith_xml` works similar to :class:`attrstartswith` except
    that the specified name is treated as an XML name instead of a Python name.
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        if not node.Attrs.isallowed_xml(self.attrname):
            return False
        attr = node.attrs.get_xml(self.attrname)
        # Fancy attributes (e.g. those containing PIs) never match
        if attr.isfancy():
            return False
        return unicode(attr).startswith(self.attrvalue)

    def __str__(self):
        args = (self.__class__.__name__, self.attrname, self.attrvalue)
        return "%s(%r, %r)" % args
579
580
class attrendswith(Selector):
    """
    Selector that selects all element nodes where an attribute with the
    specified Python name ends with the specified string. For global attributes
    the attribute class can be passed. Note that "fancy" attributes (i.e. those
    containing non-text) will not be considered::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.attrendswith("href", ".css")):
        ...     print repr(node)
        ...
        <a href="http://www.python.org/styles/screen-switcher-default.css" type="text/css" rel="stylesheet" id="screen-switcher-stylesheet" media="screen" />
        <a type="text/css" rel="stylesheet" href="http://www.python.org/styles/netscape4.css" media="scReen" />
        <a type="text/css" rel="stylesheet" href="http://www.python.org/styles/print.css" media="print" />
        <a title="large text" type="text/css" rel="alternate stylesheet" href="http://www.python.org/styles/largestyles.css" media="screen" />
        <a title="default fonts" type="text/css" rel="alternate stylesheet" href="http://www.python.org/styles/defaultfonts.css" media="screen" />
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        if not node.Attrs.isallowed(self.attrname):
            return False
        attr = node.attrs.get(self.attrname)
        # Fancy attributes (e.g. those containing PIs) never match
        if attr.isfancy():
            return False
        return unicode(attr).endswith(self.attrvalue)

    def __str__(self):
        args = (self.__class__.__name__, self.attrname, self.attrvalue)
        return "%s(%r, %r)" % args
615
616
class attrendswith_xml(Selector):
    """
    :class:`attrendswith_xml` works similar to :class:`attrendswith` except that
    the specified name is treated as an XML name instead of a Python name.
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        if not node.Attrs.isallowed_xml(self.attrname):
            return False
        attr = node.attrs.get_xml(self.attrname)
        # Fancy attributes (e.g. those containing PIs) never match
        if attr.isfancy():
            return False
        return unicode(attr).endswith(self.attrvalue)

    def __str__(self):
        args = (self.__class__.__name__, self.attrname, self.attrvalue)
        return "%s(%r, %r)" % args
638
639
class hasid(Selector):
    """
    Selector that selects all element nodes where the ``id`` attribute has the
    specified value (the attribute is looked up via its XML name ``id``;
    "fancy" attributes are not considered)::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.hasid("logo")):
        ...     print node.bytes()
        ...
        <img src="http://www.python.org/images/python-logo.gif" id="logo" alt="homepage" border="0" />
    """

    def __init__(self, id):
        self.id = id

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        if not node.Attrs.isallowed_xml("id"):
            return False
        attr = node.attrs.get_xml("id")
        if attr.isfancy():
            return False
        return unicode(attr) == self.id

    def __str__(self):
        return "%s(%r)" % (self.__class__.__name__, self.id)

    def cssweight(self):
        """
        Return the CSS specificity of an id selector: only the second
        component of the :class:`CSSWeight` is set.
        """
        return CSSWeight(0, 1, 0, 0)
670
671
class hasclass(Selector):
    """
    Selector that selects all element nodes where the ``class`` attribute
    contains the specified class name as one of its whitespace separated
    values ("fancy" attributes are not considered)::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.hasclass("reference")):
        ...     print node.bytes()
        ...
        <a class="reference" href="http://www.python.org/search">Advanced Search</a>
        <a href="http://www.python.org/about/success/rackspace" class="reference">Rackspace</a>
        <a href="http://www.python.org/about/success/ilm" class="reference">Industrial Light and Magic</a>
        <a href="http://www.python.org/about/success/astra" class="reference">AstraZeneca</a>
        ...
    """

    def __init__(self, classname):
        self.classname = classname

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        if not node.Attrs.isallowed_xml("class"):
            return False
        attr = node.attrs.get_xml("class")
        if attr.isfancy():
            return False
        # The attribute value is a whitespace separated list of class names
        classnames = unicode(attr).split()
        return self.classname in classnames

    def __str__(self):
        return "%s(%r)" % (self.__class__.__name__, self.classname)

    def cssweight(self):
        """
        Return the CSS specificity of a class selector: only the third
        component of the :class:`CSSWeight` is set.
        """
        return CSSWeight(0, 0, 1, 0)
706
707
class inattr(Selector):
    """
    Selector that selects all attribute nodes and nodes inside of attributes
    (i.e. it matches if any node in the path is an attribute)::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.inattr & xsc.Text):
        ...     print node.bytes()
        ...
        text/html; charset=utf-8
        content-type
        python programming language object oriented web free source
        ...
    """
    def matchpath(self, path):
        for pathnode in path:
            if isinstance(pathnode, xsc.Attr):
                return True
        return False

    def __str__(self):
        return "inattr"


# Replace the class with its only instance, so that ``xfind.inattr`` can be used directly as a selector
inattr = inattr()
730
731
class Combinator(Selector):
    """
    A :class:`Combinator` is a selector that transforms one or combines
    two or more other selectors in a certain way.
    """
737
738
class BinaryCombinator(Combinator):
    """
    A :class:`BinaryCombinator` is a combinator that combines two selectors:
    the left hand selector and the right hand selector.
    """
    # The operator text put between both operands by :meth:`__str__`
    # (set by subclasses)
    symbol = None

    def __init__(self, left, right):
        self.left = left
        self.right = right

    def __str__(self):
        parts = []
        for operand in (self.left, self.right):
            text = str(operand)
            # Combinators of a different kind are put in parentheses
            if isinstance(operand, Combinator) and not isinstance(operand, self.__class__):
                text = "(%s)" % text
            parts.append(text)
        return "%s%s%s" % (parts[0], self.symbol, parts[1])

    def cssweight(self):
        """
        Return the sum of the CSS specificities of both operands.
        """
        leftweight = self.left.cssweight()
        rightweight = self.right.cssweight()
        return leftweight + rightweight
761
762
class ChildCombinator(BinaryCombinator):
    """
    A :class:`ChildCombinator` is a :class:`BinaryCombinator`. To match the
    :class:`ChildCombinator` the node must match the right hand selector and
    its immediate parent must match the left hand selector (i.e. it works
    similar to the ``>`` combinator in CSS or the ``/`` combinator in XPath).

    :class:`ChildCombinator` objects can be created via the division operator
    (``/``)::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(html.a/html.img):
        ...     print node.bytes()
        ...
        <img src="http://www.python.org/images/python-logo.gif" alt="homepage" id="logo" border="0" />
        <img id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" border="0" />
        <img id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" border="0" />
        <img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />
    """
    symbol = " / "

    def matchpath(self, path):
        if not path or not self.right.matchpath(path):
            return False
        # The path without its last node is the path to the parent
        parentpath = path[:-1]
        return self.left.matchpath(parentpath)
790
791
class DescendantCombinator(BinaryCombinator):
    """
    A :class:`DescendantCombinator` is a :class:`BinaryCombinator`. To match the
    :class:`DescendantCombinator` the node must match the right hand selector
    and any of its ancestor nodes must match the left hand selector (i.e. it
    works similar to the descendant combinator in CSS or the ``//`` combinator
    in XPath).

    :class:`DescendantCombinator` objects can be created via the floor division
    operator (``//``)::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(html.div//html.img):
        ...     print node.bytes()
        ...
        <img id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" border="0" />
        <img id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" border="0" />
        <img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />
    """

    symbol = " // "

    def matchpath(self, path):
        """
        Return whether :var:`path` matches: the right hand selector is tested
        against the complete path, the left hand selector against every
        shorter prefix of the path. An empty path never matches.
        """
        if not path or not self.right.matchpath(path):
            return False
        # Test the prefixes from the longest (the parent's path) down to the
        # empty path, stopping at the first one the left hand selector accepts
        for length in range(len(path) - 1, -1, -1):
            if self.left.matchpath(path[:length]):
                return True
        return False
822
823
class AdjacentSiblingCombinator(BinaryCombinator):
    """
    An :class:`AdjacentSiblingCombinator` is a :class:`BinaryCombinator`.
    To match the :class:`AdjacentSiblingCombinator` the node must match the
    right hand selector and the immediately preceding sibling must match the
    left hand selector.

    :class:`AdjacentSiblingCombinator` objects can be created via the
    multiplication operator (``*``). The following example outputs all links
    inside those :class:`p` elements that immediately follow a :class:`h2`
    element::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(html.h2*html.p/html.a):
        ...     print node.bytes()
        ...
        <a href="http://www.scipy.org/SciPy2007" class="reference">SciPy Conference</a>
        <a href="https://www.enthought.com/scipy07/" class="reference">early registration</a>
        <a href="http://www.europython.org/sections/registration_issues/how-to-register" class="reference">Online registration</a>
        <a href="http://europython.org/" class="reference">EuroPython 2007</a>
        <a href="http://www.osdc.com.au/papers/cfp.html" class="reference">Call For Papers</a>
        <a href="http://www.swa.hpi.uni-potsdam.de/dls07/" class="reference">DLS 2007</a>
        <a href="http://pythonpapers.cgpublisher.com/" class="reference">The Python Papers</a>
        <a href="http://www.pyconuk.org/" class="reference">PyCon UK</a>
        <a href="http://www.pyconuk.org/submit.html" class="reference">proposals for talks</a>
        <a href="http://www.pycon.it/registration/" class="reference">registration online</a>
    """

    symbol = " * "

    def matchpath(self, path):
        """
        Return whether :var:`path` matches: the right hand selector is tested
        against the complete path, the left hand selector against the path
        with its last item replaced by the preceding sibling. Paths with fewer
        than two items and nodes without a preceding sibling never match.
        """
        if len(path) < 2 or not self.right.matchpath(path):
            return False
        current = path[-1]
        # Walk the parent's children, remembering the one seen just before
        # the node itself
        previous = None
        for candidate in path[-2]:
            if candidate is current:
                break
            previous = candidate
        if previous is None:
            return False
        return self.left.matchpath(path[:-1] + [previous])
868
869
class GeneralSiblingCombinator(BinaryCombinator):
    """
    A :class:`GeneralSiblingCombinator` is a :class:`BinaryCombinator`.
    To match the :class:`GeneralSiblingCombinator` the node must match the
    right hand selector and any of the preceding siblings must match the left
    hand selector.

    :class:`GeneralSiblingCombinator` objects can be created via the
    exponentiation operator (``**``). The following example outputs all links
    that are not the first links inside their parent (i.e. they have another
    link among their preceding siblings)::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(html.a**html.a):
        ...     print node.bytes()
        ...
        <a href="http://www.python.org/about/success/ilm" class="reference">Industrial Light and Magic</a>
        <a href="http://www.python.org/about/success/astra" class="reference">AstraZeneca</a>
        <a href="http://www.python.org/about/success/honeywell" class="reference">Honeywell</a>
        <a href="http://www.python.org/about/success" class="reference">and many others</a>
        <a href="http://www.zope.org/">Zope</a>
        ...
    """

    symbol = " ** "

    def matchpath(self, path):
        """
        Return whether :var:`path` matches: the right hand selector is tested
        against the complete path, the left hand selector against the path
        with its last item replaced by each preceding sibling in turn. Paths
        with fewer than two items never match.
        """
        if len(path) < 2 or not self.right.matchpath(path):
            return False
        current = path[-1]
        parentpath = path[:-1]
        for sibling in path[-2]:
            if sibling is current:
                # Reached the node itself: no preceding sibling matched
                break
            if self.left.matchpath(parentpath + [sibling]):
                return True
        return False
907
908
class ChainedCombinator(Combinator):
    """
    A :class:`ChainedCombinator` combines any number of other selectors.

    The class attribute :attr:`symbol` is the operator text that
    :meth:`__str__` places between the combined selectors.
    """

    symbol = None

    def __init__(self, *selectors):
        """
        Store the positional arguments as the tuple :attr:`selectors`.
        """
        self.selectors = selectors

    def __str__(self):
        """
        Return all selectors joined by :attr:`symbol`. A selector that is a
        :class:`Combinator` of a different class than :var:`self` is put in
        parentheses.
        """
        def render(selector):
            text = str(selector)
            if isinstance(selector, Combinator) and not isinstance(selector, self.__class__):
                return "(%s)" % text
            return text

        parts = [render(selector) for selector in self.selectors]
        return self.symbol.join(parts)

    def cssweight(self):
        """
        Always raise :exc:`TypeError`: no weight is available for a chained
        combinator.
        """
        raise TypeError("no weight info for chained combinator")
930
931
class OrCombinator(ChainedCombinator):
    """
    An :class:`OrCombinator` is a :class:`ChainedCombinator` where the node must
    match at least one of the selectors to match the :class:`OrCombinator`. An
    :class:`OrCombinator` can be created with the binary or operator (``|``)::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.hasattr("href") | xfind.hasattr("src")):
        ...     print node.bytes()
        ...
        <a type="application/rss+xml" title="RSS" rel="alternate" href="http://www.python.org/channews.rdf" />
        <a media="screen" type="text/css" id="screen-switcher-stylesheet" rel="stylesheet" href="http://www.python.org/styles/screen-switcher-default.css" />
        <a media="scReen" type="text/css" rel="stylesheet" href="http://www.python.org/styles/netscape4.css" />
        <a media="print" type="text/css" rel="stylesheet" href="http://www.python.org/styles/print.css" />
        <a media="screen" type="text/css" title="large text" rel="alternate stylesheet" href="http://www.python.org/styles/largestyles.css" />
        <a media="screen" type="text/css" title="default fonts" rel="alternate stylesheet" href="http://www.python.org/styles/defaultfonts.css" />
        <script src="http://www.python.org/js/iotbs2-key-directors-load.js" type="text/javascript"></script>
        <script src="http://www.python.org/js/iotbs2-directors.js" type="text/javascript"></script>
        <script src="http://www.python.org/js/iotbs2-core.js" type="text/javascript"></script>
        <a accesskey="1" id="logolink" href="http://www.python.org/"><img alt="homepage" src="http://www.python.org/images/python-logo.gif" id="logo" border="0" /></a>
        ...
    """

    symbol = " | "

    def matchpath(self, path):
        """
        Return ``True`` as soon as one of the combined selectors matches
        :var:`path`, ``False`` if none does.
        """
        for selector in self.selectors:
            if selector.matchpath(path):
                return True
        return False

    def __or__(self, other):
        """
        Return a new flat :class:`OrCombinator` containing the selectors of
        :var:`self` followed by :var:`other` (converted with
        :func:`xsc.makewalkfilter`).
        """
        extended = self.selectors + (xsc.makewalkfilter(other),)
        return OrCombinator(*extended)
964
965
class AndCombinator(ChainedCombinator):
    """
    An :class:`AndCombinator` is a :class:`ChainedCombinator` where the node
    must match all of the combined selectors to match the :class:`AndCombinator`.
    An :class:`AndCombinator` can be created with the binary and operator
    (``&``)::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(html.input & xfind.hasattr("id")):
        ...     print node.bytes()
        ...
        <input id="domains" name="domains" value="www.python.org" type="hidden" />
        <input id="sitesearch" name="sitesearch" value="www.python.org" type="hidden" />
        <input id="sourceid" name="sourceid" value="google-search" type="hidden" />
        <input id="q" class="input-text" name="q" type="text" />
        <input id="submit" value="search" name="submit" type="submit" class="input-button" />
    """

    symbol = " & "

    def matchpath(self, path):
        """
        Return ``False`` as soon as one of the combined selectors does not
        match :var:`path`, ``True`` if all of them match.
        """
        for selector in self.selectors:
            if not selector.matchpath(path):
                return False
        return True

    def __and__(self, other):
        """
        Return a new flat :class:`AndCombinator` containing the selectors of
        :var:`self` followed by :var:`other` (converted with
        :func:`xsc.makewalkfilter`).
        """
        extended = self.selectors + (xsc.makewalkfilter(other),)
        return AndCombinator(*extended)
993
994
class NotCombinator(Combinator):
    """
    A :class:`NotCombinator` inverts the selection logic of the underlying
    selector, i.e. a node matches only if it does not match the underlying
    selector. A :class:`NotCombinator` can be created with the unary inversion
    operator (``~``).

    The following example outputs all images that don't have a ``border``
    attribute::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(html.img & ~xfind.hasattr("border")):
        ...     print node.bytes()
        ...
        <img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />
    """

    def __init__(self, selector):
        """
        Store :var:`selector` as the selector whose result gets inverted.
        """
        self.selector = selector

    def matchpath(self, path):
        """
        Return the negated result of the underlying selector for :var:`path`.
        """
        matched = self.selector.matchpath(path)
        return not matched

    def __str__(self):
        """
        Return the underlying selector prefixed with ``~``. Any
        :class:`Combinator` other than a :class:`NotCombinator` is put in
        parentheses.
        """
        inner = self.selector
        parenthesize = isinstance(inner, Combinator) and not isinstance(inner, NotCombinator)
        template = "~(%s)" if parenthesize else "~%s"
        return template % inner
1025
1026
class CallableSelector(Selector):
    """
    A :class:`CallableSelector` is a selector that calls a user specified
    callable to select nodes. The callable gets passed the path and must return
    a bool specifying whether this path is selected. A :class:`CallableSelector`
    is created implicitly whenever a callable is passed to a method that
    expects a walk filter.

    The following example outputs all links that point outside the ``python.org``
    domain::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> def foreignlink(path):
        ...     return path and isinstance(path[-1], html.a) and not path[-1].attrs.href.asURL().server.endswith(".python.org")
        ...
        >>> for node in doc.walknode(foreignlink):
        ...     print node.bytes()
        ...
        <a href="http://youtube.com/" class="reference">YouTube.com</a>
        <a href="http://www.zope.org/">Zope</a>
        <a href="http://www.djangoproject.com/">Django</a>
        <a href="http://www.turbogears.org/">TurboGears</a>
        <a href="http://pyxml.sourceforge.net/topics/">XML</a>
        ...
    """

    def __init__(self, func):
        """
        Store the callable :var:`func` that decides whether a path matches.
        """
        self.func = func

    def matchpath(self, path):
        """
        Call the stored callable with :var:`path` and return its result
        unchanged.
        """
        callback = self.func
        return callback(path)

    def __str__(self):
        """
        Return the class name followed by the ``repr`` of the callable in
        parentheses.
        """
        classname = self.__class__.__name__
        return "%s(%r)" % (classname, self.func)
1063
1064
class nthchild(Selector):
    """
    An :class:`nthchild` object is a selector that selects every node that is
    the n-th child of its parent. E.g. ``nthchild(0)`` selects every first
    child, ``nthchild(-1)`` selects each last child. Furthermore
    ``nthchild("even")`` selects each first, third, fifth, ... child and
    ``nthchild("odd")`` selects each second, fourth, sixth, ... child.
    """

    def __init__(self, index):
        """
        Store :var:`index`: an index into the parent's children or one of the
        strings ``"even"`` and ``"odd"``.
        """
        self.index = index

    def matchpath(self, path):
        """
        Return whether the last node in :var:`path` sits at the configured
        position among the children of its parent. Paths with fewer than two
        items never match.
        """
        if len(path) < 2:
            return False
        parent = path[-2]
        node = path[-1]
        if self.index in ("even", "odd"):
            for (position, child) in enumerate(parent):
                if child is node:
                    # Positions are zero based, so "even" covers the first,
                    # third, ... child and "odd" the second, fourth, ...
                    return (position % 2) == (self.index == "odd")
            return False
        try:
            return parent[self.index] is node
        except IndexError:
            # The parent has no child at that index
            return False

    def __str__(self):
        """
        Return the class name followed by the ``repr`` of the index in
        parentheses.
        """
        classname = self.__class__.__name__
        return "%s(%r)" % (classname, self.index)
1092
1093
class nthoftype(Selector):
    """
    An :class:`nthoftype` object is a selector that selects every node that is
    the n-th node of a specified type among its siblings. Similar to
    :class:`nthchild` :class:`nthoftype` supports negative and positive indices
    as well as ``"even"`` and ``"odd"``. Which types are checked can be passed
    explicitly. If no types are passed the type of the node itself is used::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.nthoftype(0, html.h2)):
        ...     print node.bytes()
        ...
        <h2 class="news">SciPy 2007 - Conference for Scientific Computing</h2>
    """

    def __init__(self, index, *types):
        """
        Store :var:`index` (an index or ``"even"``/``"odd"``) and the tuple of
        node classes :var:`types` to be considered.
        """
        (self.index, self.types) = (index, types)

    def _find(self, path):
        """
        Generate those children of the parent (``path[-2]``) that are
        instances of the configured types, or of the class of the node
        itself (``path[-1]``) if no types have been configured.
        """
        wanted = self.types or path[-1].__class__
        for sibling in path[-2]:
            if isinstance(sibling, wanted):
                yield sibling

    def matchpath(self, path):
        """
        Return whether the last node in :var:`path` sits at the configured
        position among the siblings produced by :meth:`_find`. Paths with
        fewer than two items never match.
        """
        if len(path) < 2:
            return False
        node = path[-1]
        if self.index in ("even", "odd"):
            for (position, candidate) in enumerate(self._find(path)):
                if candidate is node:
                    # Positions are zero based, so "even" covers the first,
                    # third, ... sibling and "odd" the second, fourth, ...
                    return (position % 2) == (self.index == "odd")
            return False
        try:
            return misc.item(self._find(path), self.index) is node
        except IndexError:
            # There is no sibling of the wanted type at that index
            return False

    def __str__(self):
        """
        Return the class name followed by the ``repr`` of the index and, if
        any types are configured, their fully qualified names.
        """
        classname = self.__class__.__name__
        if not self.types:
            return "%s(%r)" % (classname, self.index)
        typenames = ", ".join("%s.%s" % (cls.__module__, cls.__name__) for cls in self.types)
        return "%s(%r, %s)" % (classname, self.index, typenames)
Note: See TracBrowser for help on using the browser.