root/livinglogic.python.xist/src/ll/xist/xfind.py @ 3241:b87e8fa740da

Revision 3241:b87e8fa740da, 37.8 KB (checked in by Walter Doerwald <walter@…>, 12 years ago)

Remove CSSWeight class, methods and tests. Use cssutils specificity instead.

Line 
1# -*- coding: utf-8 -*-
2
3## Copyright 1999-2008 by LivingLogic AG, Bayreuth/Germany
4## Copyright 1999-2008 by Walter Dörwald
5##
6## All Rights Reserved
7##
8## See xist/__init__.py for the license
9
10
11"""
12This module contains XFind and CSS selectors and related classes and functions.
13
14A selector is a XIST tree traversal filter that traverses the complete XML tree
15and outputs those nodes specified by the selector. Selectors can be combined
16with various operations and form a language comparable to XPath__ but
17implemented as Python expressions.
18
19__ http://www.w3.org/TR/xpath
20"""
21
22
23from ll import misc
24from ll.xist import xsc
25
26
27__docformat__ = "reStructuredText"
28
29
class Selector(xsc.WalkFilter):
    """
    Base class for all tree traversal filters that visit the complete tree.
    Whether a node gets output can be specified by overwriting the
    :meth:`matchpath` method. Selectors can be combined with various operations
    (see methods below).
    """

    @misc.notimplemented
    def matchpath(self, path):
        """
        Return whether the node at the end of :var:`path` is selected by this
        selector. :var:`path` is the sequence of nodes leading to the node in
        question (``path[-1]`` is the node itself, ``path[-2]`` its parent).
        Subclasses must overwrite this method.
        """
        pass

    def filterpath(self, path):
        """
        Return the walk filter result for :var:`path`: The traversal always
        continues with ``xsc.entercontent`` and ``xsc.enterattrs``; a leading
        ``True`` is added when :meth:`matchpath` accepts the path.
        """
        if self.matchpath(path):
            return (True, xsc.entercontent, xsc.enterattrs)
        return (xsc.entercontent, xsc.enterattrs)

    def __div__(self, other):
        """
        Create a :class:`ChildCombinator` with :var:`self` as the left hand
        selector and :var:`other` as the right hand selector.
        """
        return ChildCombinator(self, xsc.makewalkfilter(other))

    # ``/`` is dispatched to ``__truediv__`` instead of ``__div__`` when true
    # division is active (``from __future__ import division``), so both names
    # must be available for ``a/b`` to work everywhere.
    __truediv__ = __div__

    def __floordiv__(self, other):
        """
        Create a :class:`DescendantCombinator` with :var:`self` as the left hand
        selector and :var:`other` as the right hand selector.
        """
        return DescendantCombinator(self, xsc.makewalkfilter(other))

    def __mul__(self, other):
        """
        Create an :class:`AdjacentSiblingCombinator` with :var:`self` as the left
        hand selector and :var:`other` as the right hand selector.
        """
        return AdjacentSiblingCombinator(self, xsc.makewalkfilter(other))

    def __pow__(self, other):
        """
        Create a :class:`GeneralSiblingCombinator` with :var:`self` as the left
        hand selector and :var:`other` as the right hand selector.
        """
        return GeneralSiblingCombinator(self, xsc.makewalkfilter(other))

    def __and__(self, other):
        """
        Create an :class:`AndCombinator` from :var:`self` and :var:`other`.
        """
        return AndCombinator(self, xsc.makewalkfilter(other))

    def __or__(self, other):
        """
        Create an :class:`OrCombinator` from :var:`self` and :var:`other`.
        """
        return OrCombinator(self, xsc.makewalkfilter(other))

    def __invert__(self):
        """
        Create a :class:`NotCombinator` inverting :var:`self`.
        """
        return NotCombinator(self)
90
91
class IsInstanceSelector(Selector):
    """
    Selector that selects all nodes that are instances of one of the specified
    types. An :class:`IsInstanceSelector` can be created directly, but it's
    also possible to pass a class to a function that expects a walk filter
    (the class will then be wrapped in an :class:`IsInstanceSelector`
    automatically)::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(html.a):
        ...     print node.bytes()
        ...
        <a id="logolink" accesskey="1" href="http://www.python.org/"><img src="http://www.python.org/images/python-logo.gif" id="logo" border="0" alt="homepage" /></a>
        <a accesskey="2" href="http://www.python.org/#left%2dhand%2dnavigation"><img id="skiptonav" src="http://www.python.org/images/trans.gif" border="0" alt="skip to navigation" /></a>
        <a accesskey="3" href="http://www.python.org/#content%2dbody"><img id="skiptocontent" src="http://www.python.org/images/trans.gif" border="0" alt="skip to content" /></a>
        <a class="reference" href="http://www.python.org/search">Advanced Search</a>
        <a title="About The Python Language" href="http://www.python.org/about/">About</a>
        ...
    """
    def __init__(self, *types):
        self.types = types

    def matchpath(self, path):
        # An empty path has no node that could be tested
        return bool(path) and isinstance(path[-1], self.types)

    def __or__(self, other):
        # Two type checks can be merged into a single ``isinstance`` test, so
        # node classes and other instance selectors are folded into one tuple;
        # everything else goes through the generic ``|`` of the base class.
        if isinstance(other, xsc._Node_Meta):
            merged = self.types + (other,)
        elif isinstance(other, IsInstanceSelector):
            merged = self.types + other.types
        else:
            return Selector.__or__(self, other)
        return IsInstanceSelector(*merged)

    def __getitem__(self, index):
        """
        Return an :class:`nthoftype` selector that uses :var:`index` as the
        index and ``self.types`` as the types.
        """
        return nthoftype(index, *self.types)

    def __str__(self):
        names = ["%s.%s" % (cls.__module__, cls.__name__) for cls in self.types]
        # A single type is shown bare, several types as a parenthesized union
        if len(names) == 1:
            return names[0]
        return "(%s)" % " | ".join(names)
140
141
class hasname(Selector):
    """
    Selector that selects all nodes whose Python name (i.e. the name of the
    node class) is :var:`name`. Without a namespace elements, processing
    instructions and entities are considered. If a namespace name is passed
    as the second argument, only elements from that namespace are selected::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.hasname("img")):
        ...     print node.bytes()
        ...
        <img border="0" src="http://www.python.org/images/python-logo.gif" alt="homepage" id="logo" />
        <img border="0" id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" />
        <img border="0" id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" />
        <img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />
    """
    def __init__(self, name, xmlns=None):
        self.name = name
        # The namespace is passed through ``xsc.nsname`` once, so matching
        # only has to compare the stored value
        self.xmlns = xsc.nsname(xmlns)

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if self.xmlns is None:
            # No namespace given: any named node type qualifies
            named = (xsc.Element, xsc.ProcInst, xsc.Entity)
            return isinstance(node, named) and node.__class__.__name__ == self.name
        # With a namespace only elements can match
        return isinstance(node, xsc.Element) and node.__class__.__name__ == self.name and node.xmlns == self.xmlns

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r)" % (clsname, self.name)
174
175
class hasname_xml(Selector):
    """
    :class:`hasname_xml` works like :class:`hasname`, but compares the
    specified name with the XML name of the node (its ``xmlname`` attribute)
    instead of the Python name.
    """
    def __init__(self, name, xmlns=None):
        self.name = name
        # The namespace is passed through ``xsc.nsname`` once, so matching
        # only has to compare the stored value
        self.xmlns = xsc.nsname(xmlns)

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if self.xmlns is None:
            # No namespace given: any named node type qualifies
            named = (xsc.Element, xsc.ProcInst, xsc.Entity)
            return isinstance(node, named) and node.xmlname == self.name
        # With a namespace only elements can match
        return isinstance(node, xsc.Element) and node.xmlname == self.name and node.xmlns == self.xmlns

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r)" % (clsname, self.name)
196
197
class IsSelector(Selector):
    """
    Selector that selects one specific node in the tree (the test is done by
    identity, not by equality). This can be combined with other selectors via
    :class:`ChildCombinator` or :class:`DescendantCombinator` selectors to
    select children of this specific node. You can either create an
    :class:`IsSelector` directly or simply pass a node to a function that
    expects a walk filter::

        >>> from ll.xist import xsc, parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(doc[0]/xsc.Element):
        ...     print repr(node)
        ...
        <ll.xist.ns.html.head element object (13 children/no attrs) (from http://www.python.org/:6:?) at 0xb6c82f4c>
        <ll.xist.ns.html.body element object (19 children/no attrs) (from http://www.python.org/:26:?) at 0xb6c3154c>
    """
    def __init__(self, node):
        self.node = node

    def matchpath(self, path):
        """
        Return whether the last node in :var:`path` is the node this selector
        was created with. An empty path never matches.
        """
        # ``path and ...`` would hand back the empty path object itself for an
        # empty path; always return a real boolean like the other selectors do.
        return bool(path) and path[-1] is self.node

    def __str__(self):
        return "%s(%r)" % (self.__class__.__name__, self.node)
222
223
class isroot(Selector):
    """
    Selector that selects the node whose path consists of exactly one node,
    i.e. the node that has no ancestors in the traversal path.
    """
    def matchpath(self, path):
        depth = len(path)
        return depth == 1

    def __str__(self):
        return "isroot"


# The class is replaced by its only instance, so ``xfind.isroot`` can be used
# directly as a selector
isroot = isroot()
233
234
class empty(Selector):
    """
    Selector that selects all elements or fragments that have no content
    (i.e. whose length is zero)::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.empty):
        ...     print node.bytes()
        ...
        <meta content="text/html; charset=utf-8" http-equiv="content-type" />
        <meta content="python programming language object oriented web free source" name="keywords" />
        <meta content="      Home page for Python, an interpreted, interactive, object-oriented, extensible
              programming language. It provides an extraordinary combination of clarity and
              versatility, and is free and comprehensively ported. " name="description" />
        <a type="application/rss+xml" href="http://www.python.org/channews.rdf" rel="alternate" title="RSS" />
        ...
    """

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        # Only container nodes can be empty; all other node types never match
        if not isinstance(node, (xsc.Element, xsc.Frag)):
            return False
        return len(node) == 0

    def __str__(self):
        return "empty"


# The class is replaced by its only instance, so ``xfind.empty`` can be used
# directly as a selector
empty = empty()
265
266
class onlychild(Selector):
    """
    Selector that selects all nodes that are the only child of their parent::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.onlychild & html.a):
        ...     print node.bytes()
        ...
        <a accesskey="2" href="http://www.python.org/#left%2dhand%2dnavigation"><img id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" border="0" /></a>
        <a accesskey="3" href="http://www.python.org/#content%2dbody"><img id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" border="0" /></a>
        <a href="http://www.python.org/download/releases/2.5.1">Quick Links (2.5.1)</a>
        <a title="Manuals for Latest Stable Release" href="http://docs.python.org/">Documentation</a>
        ...
    """

    def matchpath(self, path):
        # Without a parent in the path there's nothing to be the only child of
        if len(path) < 2:
            return False
        parent = path[-2]
        if not isinstance(parent, (xsc.Frag, xsc.Element)):
            return False
        return len(parent)==1 and parent[0] is path[-1]

    def __str__(self):
        return "onlychild"


# The class is replaced by its only instance, so ``xfind.onlychild`` can be
# used directly as a selector
onlychild = onlychild()
295
296
class onlyoftype(Selector):
    """
    Selector that selects all nodes that are the only nodes of their type among
    their siblings (a sibling counts as being of the same type if it is an
    instance of the class of the node)::

        >>> from ll.xist import xsc, parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.onlyoftype & xsc.Element):
        ...     print repr(node)
        ...
        <ll.xist.ns.html.html element object (2 children/1 attr) (from http://www.python.org/:4:?) at 0xb6d6e7ec>
        <ll.xist.ns.html.head element object (13 children/no attrs) (from http://www.python.org/:6:?) at 0xb6cc1f8c>
        <ll.xist.ns.html.title element object (1 child/no attrs) (from http://www.python.org/:8:?) at 0xb6d79b8c>
        <ll.xist.ns.html.body element object (19 children/no attrs) (from http://www.python.org/:26:?) at 0xb6d7282c>
        ...
    """

    def matchpath(self, path):
        # Without a parent in the path there are no siblings to compare with
        if len(path) < 2:
            return False
        node = path[-1]
        parent = path[-2]
        if not isinstance(parent, (xsc.Frag, xsc.Element)):
            return False
        # Any other child of the same type disqualifies the node
        return not any(isinstance(child, node.__class__) and child is not node for child in parent)

    def __str__(self):
        return "onlyoftype"


# The class is replaced by its only instance, so ``xfind.onlyoftype`` can be
# used directly as a selector
onlyoftype = onlyoftype()
331
332
class hasattr(Selector):
    """
    Selector that selects all element nodes that have at least one of the
    attributes with the specified Python names. For selecting nodes with
    global attributes the attribute class can be passed::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html, xml
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.hasattr(xml.Attrs.lang)):
        ...     print repr(node)
        ...
        <ll.xist.ns.html.html element object (2 children/2 attrs) (from http://www.python.org/:4:?) at 0xb6d71d4c>
    """

    def __init__(self, *attrnames):
        self.attrnames = attrnames

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        # An attribute is only looked up if the element class allows it
        return any(node.Attrs.isallowed(attrname) and node.attrs.has(attrname) for attrname in self.attrnames)

    def __str__(self):
        names = ", ".join(repr(attrname) for attrname in self.attrnames)
        return "%s(%s)" % (self.__class__.__name__, names)
362
363
class hasattr_xml(Selector):
    """
    :class:`hasattr_xml` works like :class:`hasattr`, but the specified
    attribute names are XML names, not Python names.
    """

    def __init__(self, *attrnames):
        self.attrnames = attrnames

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        if not isinstance(node, xsc.Element):
            return False
        # An attribute is only looked up if the element class allows it
        return any(node.Attrs.isallowed_xml(attrname) and node.attrs.has_xml(attrname) for attrname in self.attrnames)

    def __str__(self):
        names = ", ".join(repr(attrname) for attrname in self.attrnames)
        return "%s(%s)" % (self.__class__.__name__, names)
384
385
class attrhasvalue(Selector):
    """
    Selector that selects all element nodes where the attribute with the
    specified Python name is equal to the specified value. For global
    attributes the attribute class can be passed. Note that "fancy" attributes
    (i.e. those containing non-text) never match::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.attrhasvalue("rel", "stylesheet")):
        ...     print repr(node)
        ...
        <a media="screen" type="text/css" href="http://www.python.org/styles/screen-switcher-default.css" rel="stylesheet" id="screen-switcher-stylesheet" />
        <a media="scReen" type="text/css" rel="stylesheet" href="http://www.python.org/styles/netscape4.css" />
        <a media="print" type="text/css" rel="stylesheet" href="http://www.python.org/styles/print.css" />
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        # Only elements whose class allows the attribute are candidates
        if not (isinstance(node, xsc.Element) and node.Attrs.isallowed(self.attrname)):
            return False
        attr = node.attrs.get(self.attrname)
        if attr.isfancy(): # if there are PIs, say no
            return False
        return unicode(attr) == self.attrvalue

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attrname, self.attrvalue)
418
419
class attrhasvalue_xml(Selector):
    """
    :class:`attrhasvalue_xml` works like :class:`attrhasvalue`, but the
    specified attribute name is the XML name, not the Python name.
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        # Only elements whose class allows the attribute are candidates
        if not (isinstance(node, xsc.Element) and node.Attrs.isallowed_xml(self.attrname)):
            return False
        attr = node.attrs.get_xml(self.attrname)
        if attr.isfancy(): # if there are PIs, say no
            return False
        return unicode(attr) == self.attrvalue

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attrname, self.attrvalue)
441
442
class attrcontains(Selector):
    """
    Selector that selects all element nodes where the value of the attribute
    with the specified Python name contains the specified substring. For
    global attributes the attribute class can be passed. Note that "fancy"
    attributes (i.e. those containing non-text) never match::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.attrcontains("rel", "stylesheet")):
        ...     print repr(node)
        ...
        <a type="text/css" id="screen-switcher-stylesheet" media="screen" rel="stylesheet" href="http://www.python.org/styles/screen-switcher-default.css" />
        <a type="text/css" media="scReen" rel="stylesheet" href="http://www.python.org/styles/netscape4.css" />
        <a type="text/css" media="print" rel="stylesheet" href="http://www.python.org/styles/print.css" />
        <a type="text/css" title="large text" media="screen" rel="alternate stylesheet" href="http://www.python.org/styles/largestyles.css" />
        <a type="text/css" title="default fonts" media="screen" rel="alternate stylesheet" href="http://www.python.org/styles/defaultfonts.css" />
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        # Only elements whose class allows the attribute are candidates
        if not (isinstance(node, xsc.Element) and node.Attrs.isallowed(self.attrname)):
            return False
        attr = node.attrs.get(self.attrname)
        if attr.isfancy(): # if there are PIs, say no
            return False
        return self.attrvalue in unicode(attr)

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attrname, self.attrvalue)
477
478
class attrcontains_xml(Selector):
    """
    :class:`attrcontains_xml` works like :class:`attrcontains`, but the
    specified attribute name is the XML name, not the Python name.
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        # Only elements whose class allows the attribute are candidates
        if not (isinstance(node, xsc.Element) and node.Attrs.isallowed_xml(self.attrname)):
            return False
        attr = node.attrs.get_xml(self.attrname)
        if attr.isfancy(): # if there are PIs, say no
            return False
        return self.attrvalue in unicode(attr)

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attrname, self.attrvalue)
500
501
class attrstartswith(Selector):
    """
    Selector that selects all element nodes where the value of the attribute
    with the specified Python name starts with the specified string. For
    global attributes the attribute class can be passed. Note that "fancy"
    attributes (i.e. those containing non-text) never match::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.attrstartswith("class_", "input-")):
        ...     print repr(node)
        ...
        <input class="input-text" id="q" type="text" name="q" />
        <input value="search" class="input-button" id="submit" type="submit" name="submit" />
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        # Only elements whose class allows the attribute are candidates
        if not (isinstance(node, xsc.Element) and node.Attrs.isallowed(self.attrname)):
            return False
        attr = node.attrs.get(self.attrname)
        if attr.isfancy(): # if there are PIs, say no
            return False
        return unicode(attr).startswith(self.attrvalue)

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attrname, self.attrvalue)
533
534
class attrstartswith_xml(Selector):
    """
    :class:`attrstartswith_xml` works like :class:`attrstartswith`, but the
    specified attribute name is the XML name, not the Python name.
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        # Only elements whose class allows the attribute are candidates
        if not (isinstance(node, xsc.Element) and node.Attrs.isallowed_xml(self.attrname)):
            return False
        attr = node.attrs.get_xml(self.attrname)
        if attr.isfancy(): # if there are PIs, say no
            return False
        return unicode(attr).startswith(self.attrvalue)

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attrname, self.attrvalue)
556
557
class attrendswith(Selector):
    """
    Selector that selects all element nodes where the value of the attribute
    with the specified Python name ends with the specified string. For global
    attributes the attribute class can be passed. Note that "fancy" attributes
    (i.e. those containing non-text) never match::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.attrendswith("href", ".css")):
        ...     print repr(node)
        ...
        <a href="http://www.python.org/styles/screen-switcher-default.css" type="text/css" rel="stylesheet" id="screen-switcher-stylesheet" media="screen" />
        <a type="text/css" rel="stylesheet" href="http://www.python.org/styles/netscape4.css" media="scReen" />
        <a type="text/css" rel="stylesheet" href="http://www.python.org/styles/print.css" media="print" />
        <a title="large text" type="text/css" rel="alternate stylesheet" href="http://www.python.org/styles/largestyles.css" media="screen" />
        <a title="default fonts" type="text/css" rel="alternate stylesheet" href="http://www.python.org/styles/defaultfonts.css" media="screen" />
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        # Only elements whose class allows the attribute are candidates
        if not (isinstance(node, xsc.Element) and node.Attrs.isallowed(self.attrname)):
            return False
        attr = node.attrs.get(self.attrname)
        if attr.isfancy(): # if there are PIs, say no
            return False
        return unicode(attr).endswith(self.attrvalue)

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attrname, self.attrvalue)
592
593
class attrendswith_xml(Selector):
    """
    :class:`attrendswith_xml` works like :class:`attrendswith`, but the
    specified attribute name is the XML name, not the Python name.
    """

    def __init__(self, attrname, attrvalue):
        self.attrname = attrname
        self.attrvalue = attrvalue

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        # Only elements whose class allows the attribute are candidates
        if not (isinstance(node, xsc.Element) and node.Attrs.isallowed_xml(self.attrname)):
            return False
        attr = node.attrs.get_xml(self.attrname)
        if attr.isfancy(): # if there are PIs, say no
            return False
        return unicode(attr).endswith(self.attrvalue)

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r, %r)" % (clsname, self.attrname, self.attrvalue)
615
616
class hasid(Selector):
    """
    Selector that selects all element nodes whose ``id`` attribute (looked up
    via its XML name) is equal to the specified value. "Fancy" ``id``
    attributes never match::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.hasid("logo")):
        ...     print node.bytes()
        ...
        <img src="http://www.python.org/images/python-logo.gif" id="logo" alt="homepage" border="0" />
    """

    def __init__(self, id):
        self.id = id

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        # Only elements whose class allows an ``id`` attribute are candidates
        if not (isinstance(node, xsc.Element) and node.Attrs.isallowed_xml("id")):
            return False
        attr = node.attrs.get_xml("id")
        if attr.isfancy():
            return False
        return unicode(attr) == self.id

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r)" % (clsname, self.id)
644
645
class hasclass(Selector):
    """
    Selector that selects all element nodes where the specified name is one of
    the whitespace separated names in the ``class`` attribute (which is looked
    up via its XML name). "Fancy" ``class`` attributes never match::

        >>> from ll.xist import parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.hasclass("reference")):
        ...     print node.bytes()
        ...
        <a class="reference" href="http://www.python.org/search">Advanced Search</a>
        <a href="http://www.python.org/about/success/rackspace" class="reference">Rackspace</a>
        <a href="http://www.python.org/about/success/ilm" class="reference">Industrial Light and Magic</a>
        <a href="http://www.python.org/about/success/astra" class="reference">AstraZeneca</a>
        ...
    """

    def __init__(self, classname):
        self.classname = classname

    def matchpath(self, path):
        if not path:
            return False
        node = path[-1]
        # Only elements whose class allows a ``class`` attribute are candidates
        if not (isinstance(node, xsc.Element) and node.Attrs.isallowed_xml("class")):
            return False
        attr = node.attrs.get_xml("class")
        if attr.isfancy():
            return False
        classnames = unicode(attr).split()
        return self.classname in classnames

    def __str__(self):
        clsname = self.__class__.__name__
        return "%s(%r)" % (clsname, self.classname)
677
678
class inattr(Selector):
    """
    Selector that selects all attribute nodes and nodes inside of attributes
    (i.e. all nodes that have an attribute node somewhere in their path)::

        >>> from ll.xist import xsc, parsers, xfind
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.inattr & xsc.Text):
        ...     print node.bytes()
        ...
        text/html; charset=utf-8
        content-type
        python programming language object oriented web free source
        ...
    """
    def matchpath(self, path):
        # The node itself or any of its ancestors may be the attribute
        for node in path:
            if isinstance(node, xsc.Attr):
                return True
        return False

    def __str__(self):
        return "inattr"


# The class is replaced by its only instance, so ``xfind.inattr`` can be used
# directly as a selector
inattr = inattr()
701
702
class Combinator(Selector):
    """
    A :class:`Combinator` is a selector that transforms one or combines
    two or more other selectors in a certain way.
    """
708
709
class BinaryCombinator(Combinator):
    """
    A :class:`BinaryCombinator` is a combinator that combines two selectors:
    the left hand selector and the right hand selector. Subclasses set
    :attr:`symbol` to the operator text that is put between both operands in
    the string representation.
    """
    symbol = None

    def __init__(self, left, right):
        self.left = left
        self.right = right

    def __str__(self):
        operands = []
        for selector in (self.left, self.right):
            text = str(selector)
            # Combinators of a different kind are put in parentheses, so the
            # grouping stays visible; nested combinators of the same class
            # are chained without them
            if isinstance(selector, Combinator) and not isinstance(selector, self.__class__):
                text = "(%s)" % text
            operands.append(text)
        return "%s%s%s" % (operands[0], self.symbol, operands[1])
729
730
class ChildCombinator(BinaryCombinator):
    """
    A :class:`ChildCombinator` is a :class:`BinaryCombinator`. To match the
    :class:`ChildCombinator` the node must match the right hand selector and
    its immediate parent must match the left hand selector (i.e. it works
    similar to the ``>`` combinator in CSS or the ``/`` combinator in XPath).

    :class:`ChildCombinator` objects can be created via the division operator
    (``/``)::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(html.a/html.img):
        ...     print node.bytes()
        ...
        <img src="http://www.python.org/images/python-logo.gif" alt="homepage" id="logo" border="0" />
        <img id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" border="0" />
        <img id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" border="0" />
        <img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />
    """
    symbol = " / "

    def matchpath(self, path):
        # The node itself is tested first; the left hand selector is only
        # consulted (with the path of the parent) if the right hand one matches
        if not path or not self.right.matchpath(path):
            return False
        parentpath = path[:-1]
        return self.left.matchpath(parentpath)
758
759
class DescendantCombinator(BinaryCombinator):
    """
    A :class:`DescendantCombinator` is a :class:`BinaryCombinator`. To match the
    :class:`DescendantCombinator` the node must match the right hand selector
    and any of its ancestor nodes must match the left hand selector (i.e. it
    works similar to the descendant combinator in CSS or the ``//`` combinator
    in XPath).

    :class:`DescendantCombinator` objects can be created via the floor division
    operator (``//``)::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(html.div//html.img):
        ...     print node.bytes()
        ...
        <img id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" border="0" />
        <img id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" border="0" />
        <img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />
    """
    symbol = " // "

    def matchpath(self, path):
        if not path or not self.right.matchpath(path):
            return False
        # Try the left hand selector on ever shorter ancestor paths, starting
        # with the parent path and ending with the empty path
        ancestors = path[:-1]
        while True:
            if self.left.matchpath(ancestors):
                return True
            if not ancestors:
                return False
            ancestors = ancestors[:-1]
790
791
class AdjacentSiblingCombinator(BinaryCombinator):
    """
    A :class:`AdjacentSiblingCombinator` is a :class:`BinaryCombinator`.
    To match the :class:`AdjacentSiblingCombinator` the node must match the
    right hand selector and the immediately preceding sibling must match the
    left hand selector.

    :class:`AdjacentSiblingCombinator` objects can be created via the
    multiplication operator (``*``). The following example outputs all links
    inside those :class:`p` elements that immediately follow a :class:`h2`
    element::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(html.h2*html.p/html.a):
        ...     print node.bytes()
        ...
        <a href="http://www.scipy.org/SciPy2007" class="reference">SciPy Conference</a>
        <a href="https://www.enthought.com/scipy07/" class="reference">early registration</a>
        <a href="http://www.europython.org/sections/registration_issues/how-to-register" class="reference">Online registration</a>
        <a href="http://europython.org/" class="reference">EuroPython 2007</a>
        <a href="http://www.osdc.com.au/papers/cfp.html" class="reference">Call For Papers</a>
        <a href="http://www.swa.hpi.uni-potsdam.de/dls07/" class="reference">DLS 2007</a>
        <a href="http://pythonpapers.cgpublisher.com/" class="reference">The Python Papers</a>
        <a href="http://www.pyconuk.org/" class="reference">PyCon UK</a>
        <a href="http://www.pyconuk.org/submit.html" class="reference">proposals for talks</a>
        <a href="http://www.pycon.it/registration/" class="reference">registration online</a>
    """

    def matchpath(self, path):
        """
        Return whether :var:`path` matches: the path must have at least two
        items, the complete path must match the right hand selector, and the
        path with its last item replaced by the immediately preceding sibling
        must match the left hand selector.

        A node that is the first child, or that is not found at all when
        iterating over its parent, has no preceding sibling and never matches.
        """
        if len(path) < 2 or not self.right.matchpath(path):
            return False
        node = path[-1]
        sibling = None
        for child in path[-2]:
            if child is node:
                break
            sibling = child
        else:
            # The node is not among the items its parent iterates over
            # (presumably possible for nodes reached via ``enterattrs``, e.g.
            # attributes -- TODO confirm). Without this guard the parent's
            # last child would wrongly be treated as the preceding sibling.
            return False
        if sibling is None:
            # The node is the first child.
            return False
        return self.left.matchpath(path[:-1] + [sibling])

    # Separator used when the combinator is converted to a string.
    symbol = " * "
836
837
class GeneralSiblingCombinator(BinaryCombinator):
    """
    A :class:`GeneralSiblingCombinator` is a :class:`BinaryCombinator`.
    To match the :class:`GeneralSiblingCombinator` the node must match the
    right hand selector and any of the preceding siblings must match the left
    hand selector.

    :class:`GeneralSiblingCombinator` objects can be created via the
    exponentiation operator (``**``). The following example outputs all links
    that are not the first links inside their parent (i.e. they have another
    link among their preceding siblings)::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(html.a**html.a):
        ...     print node.bytes()
        ...
        <a href="http://www.python.org/about/success/ilm" class="reference">Industrial Light and Magic</a>
        <a href="http://www.python.org/about/success/astra" class="reference">AstraZeneca</a>
        <a href="http://www.python.org/about/success/honeywell" class="reference">Honeywell</a>
        <a href="http://www.python.org/about/success" class="reference">and many others</a>
        <a href="http://www.zope.org/">Zope</a>
        ...
    """

    def matchpath(self, path):
        """
        Return whether :var:`path` matches: the path must have at least two
        items, the complete path must match the right hand selector, and for
        at least one sibling preceding the node the path with its last item
        replaced by that sibling must match the left hand selector.

        A node that is not found at all when iterating over its parent has no
        preceding siblings and never matches.
        """
        if len(path) < 2 or not self.right.matchpath(path):
            return False
        node = path[-1]
        parentpath = path[:-1]
        found = False
        for child in path[-2]:
            if child is node:
                # Only siblings in front of the node count.
                return found
            # Once a sibling has matched, keep iterating only to confirm that
            # the node really is one of the parent's children.
            if not found and self.left.matchpath(parentpath + [child]):
                found = True
        # The node is not among the items its parent iterates over (presumably
        # possible for nodes reached via ``enterattrs``, e.g. attributes --
        # TODO confirm), so none of the children is a preceding sibling.
        return False

    # Separator used when the combinator is converted to a string.
    symbol = " ** "
875
876
class ChainedCombinator(Combinator):
    """
    A :class:`ChainedCombinator` combines any number of other selectors.
    Subclasses provide the string that joins the selectors in the string
    representation via the class attribute :attr:`symbol`.
    """

    # Separator for :meth:`__str__`; overwritten by subclasses.
    symbol = None

    def __init__(self, *selectors):
        """
        Store the positional arguments :var:`selectors` as a tuple.
        """
        self.selectors = selectors

    def __str__(self):
        """
        Return the string forms of all selectors joined by :attr:`symbol`.
        Combinators of a different class than :var:`self` are parenthesized.
        """
        def render(selector):
            text = str(selector)
            nested = isinstance(selector, Combinator) and not isinstance(selector, self.__class__)
            return "(%s)" % text if nested else text

        return self.symbol.join(render(selector) for selector in self.selectors)
895
896
class OrCombinator(ChainedCombinator):
    """
    An :class:`OrCombinator` is a :class:`ChainedCombinator` that matches a
    node as soon as at least one of the combined selectors matches it. An
    :class:`OrCombinator` can be created with the binary or operator (``|``)::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.hasattr("href") | xfind.hasattr("src")):
        ...     print node.bytes()
        ...
        <a type="application/rss+xml" title="RSS" rel="alternate" href="http://www.python.org/channews.rdf" />
        <a media="screen" type="text/css" id="screen-switcher-stylesheet" rel="stylesheet" href="http://www.python.org/styles/screen-switcher-default.css" />
        <a media="scReen" type="text/css" rel="stylesheet" href="http://www.python.org/styles/netscape4.css" />
        <a media="print" type="text/css" rel="stylesheet" href="http://www.python.org/styles/print.css" />
        <a media="screen" type="text/css" title="large text" rel="alternate stylesheet" href="http://www.python.org/styles/largestyles.css" />
        <a media="screen" type="text/css" title="default fonts" rel="alternate stylesheet" href="http://www.python.org/styles/defaultfonts.css" />
        <script src="http://www.python.org/js/iotbs2-key-directors-load.js" type="text/javascript"></script>
        <script src="http://www.python.org/js/iotbs2-directors.js" type="text/javascript"></script>
        <script src="http://www.python.org/js/iotbs2-core.js" type="text/javascript"></script>
        <a accesskey="1" id="logolink" href="http://www.python.org/"><img alt="homepage" src="http://www.python.org/images/python-logo.gif" id="logo" border="0" /></a>
        ...
    """

    # Separator used when the combinator is converted to a string.
    symbol = " | "

    def matchpath(self, path):
        """
        Return whether any of the combined selectors matches :var:`path`.
        Selectors are tried in order; the first match ends the search.
        """
        for selector in self.selectors:
            if selector.matchpath(path):
                return True
        return False

    def __or__(self, other):
        """
        Return a new :class:`OrCombinator` containing the selectors of
        :var:`self` followed by :var:`other` (converted to a walk filter).
        """
        extended = self.selectors + (xsc.makewalkfilter(other),)
        return OrCombinator(*extended)
929
930
class AndCombinator(ChainedCombinator):
    """
    An :class:`AndCombinator` is a :class:`ChainedCombinator` that matches a
    node only if every one of the combined selectors matches it. An
    :class:`AndCombinator` can be created with the binary and operator
    (``&``)::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(html.input & xfind.hasattr("id")):
        ...     print node.bytes()
        ...
        <input id="domains" name="domains" value="www.python.org" type="hidden" />
        <input id="sitesearch" name="sitesearch" value="www.python.org" type="hidden" />
        <input id="sourceid" name="sourceid" value="google-search" type="hidden" />
        <input id="q" class="input-text" name="q" type="text" />
        <input id="submit" value="search" name="submit" type="submit" class="input-button" />
    """

    # Separator used when the combinator is converted to a string.
    symbol = " & "

    def matchpath(self, path):
        """
        Return whether all of the combined selectors match :var:`path`.
        Selectors are tried in order; the first failure ends the search.
        """
        for selector in self.selectors:
            if not selector.matchpath(path):
                return False
        return True

    def __and__(self, other):
        """
        Return a new :class:`AndCombinator` containing the selectors of
        :var:`self` followed by :var:`other` (converted to a walk filter).
        """
        extended = self.selectors + (xsc.makewalkfilter(other),)
        return AndCombinator(*extended)
958
959
class NotCombinator(Combinator):
    """
    A :class:`NotCombinator` inverts the selection logic of the underlying
    selector, i.e. a node matches only if it does not match the underlying
    selector. A :class:`NotCombinator` can be created with the unary inversion
    operator (``~``).

    The following example outputs all images that don't have a ``border``
    attribute::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(html.img & ~xfind.hasattr("border")):
        ...     print node.bytes()
        ...
        <img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />
    """

    def __init__(self, selector):
        """
        Store :var:`selector` as the selector whose result gets inverted.
        """
        self.selector = selector

    def matchpath(self, path):
        """
        Return the negated result of the underlying selector for :var:`path`.
        """
        return not self.selector.matchpath(path)

    def __str__(self):
        """
        Return ``~`` followed by the string form of the underlying selector.
        The selector is parenthesized if it is a combinator other than a
        :class:`NotCombinator`.
        """
        inner = self.selector
        needsparens = isinstance(inner, Combinator) and not isinstance(inner, NotCombinator)
        template = "~(%s)" if needsparens else "~%s"
        return template % inner
990
991
class CallableSelector(Selector):
    """
    A :class:`CallableSelector` is a selector that delegates the decision
    whether a node is selected to a user specified callable. The callable gets
    passed the path and must return a bool specifying whether this path is
    selected. A :class:`CallableSelector` is created implicitly whenever a
    callable is passed to a method that expects a walk filter.

    The following example outputs all links that point outside the ``python.org``
    domain::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> def foreignlink(path):
        ...     return path and isinstance(path[-1], html.a) and not path[-1].attrs.href.asURL().server.endswith(".python.org")
        ...
        >>> for node in doc.walknode(foreignlink):
        ...     print node.bytes()
        ...
        <a href="http://youtube.com/" class="reference">YouTube.com</a>
        <a href="http://www.zope.org/">Zope</a>
        <a href="http://www.djangoproject.com/">Django</a>
        <a href="http://www.turbogears.org/">TurboGears</a>
        <a href="http://pyxml.sourceforge.net/topics/">XML</a>
        ...
    """

    def __init__(self, func):
        """
        Store :var:`func` as the callable that decides about each path.
        """
        self.func = func

    def matchpath(self, path):
        """
        Call the stored callable with :var:`path` and return its result
        unchanged.
        """
        decide = self.func
        return decide(path)

    def __str__(self):
        """
        Return the class name followed by the ``repr`` of the callable in
        parentheses.
        """
        name = self.__class__.__name__
        return "%s(%r)" % (name, self.func)
1028
1029
class nthchild(Selector):
    """
    An :class:`nthchild` object is a selector that selects every node that is
    the n-th child of its parent. Indices are zero based and may be negative,
    e.g. ``nthchild(0)`` selects every first child and ``nthchild(-1)`` selects
    each last child. Furthermore ``nthchild("even")`` selects each first,
    third, fifth, ... child and ``nthchild("odd")`` selects each second,
    fourth, sixth, ... child.
    """

    def __init__(self, index):
        """
        Store :var:`index`, which is either an integer position or one of the
        strings ``"even"`` and ``"odd"``.
        """
        self.index = index

    def matchpath(self, path):
        """
        Return whether the last node in :var:`path` sits at the configured
        position among the children of its parent. Paths with fewer than two
        items never match.
        """
        if len(path) < 2:
            return False
        parent, node = path[-2], path[-1]
        index = self.index
        if index in ("even", "odd"):
            # Positions are zero based, so "even" covers positions 0, 2, 4, ...
            parity = 1 if index == "odd" else 0
            for (position, child) in enumerate(parent):
                if child is node:
                    return position % 2 == parity
            # The node was not found among the parent's children.
            return False
        try:
            return parent[index] is node
        except IndexError:
            # The parent has no child at that position.
            return False

    def __str__(self):
        """
        Return the class name followed by the ``repr`` of the index in
        parentheses.
        """
        name = self.__class__.__name__
        return "%s(%r)" % (name, self.index)
1057
1058
class nthoftype(Selector):
    """
    An :class:`nthoftype` object is a selector that selects every node that is
    the n-th node of a specified type among its siblings. Similar to
    :class:`nthchild` :class:`nthoftype` supports negative and positive indices
    as well as ``"even"`` and ``"odd"``. Which types are checked can be passed
    explicitly. If no types are passed the type of the node itself is used::

        >>> from ll.xist import parsers, xfind
        >>> from ll.xist.ns import html
        >>> doc = parsers.parseurl("http://www.python.org", tidy=True)
        >>> for node in doc.walknode(xfind.nthoftype(0, html.h2)):
        ...     print node.bytes()
        ...
        <h2 class="news">SciPy 2007 - Conference for Scientific Computing</h2>
    """

    def __init__(self, index, *types):
        """
        Store :var:`index` (an integer position, ``"even"`` or ``"odd"``) and
        the tuple of node classes :var:`types` that should be counted.
        """
        self.index = index
        self.types = types

    def _find(self, path):
        """
        Generate those children of the parent (``path[-2]``) that are
        instances of the configured types, or of the class of the node itself
        (``path[-1]``) if no types were configured.
        """
        wanted = self.types or path[-1].__class__
        for sibling in path[-2]:
            if not isinstance(sibling, wanted):
                continue
            yield sibling

    def matchpath(self, path):
        """
        Return whether the last node in :var:`path` sits at the configured
        position among the siblings produced by :meth:`_find`. Paths with
        fewer than two items never match.
        """
        if len(path) < 2:
            return False
        node = path[-1]
        index = self.index
        if index in ("even", "odd"):
            # Positions are zero based, so "even" covers positions 0, 2, 4, ...
            parity = 1 if index == "odd" else 0
            for (position, sibling) in enumerate(self._find(path)):
                if sibling is node:
                    return position % 2 == parity
            # The node is not among the siblings of the requested types.
            return False
        try:
            return misc.item(self._find(path), index) is node
        except IndexError:
            # There is no sibling of the requested types at that position.
            return False

    def __str__(self):
        """
        Return the class name followed by the ``repr`` of the index and, if
        types were configured, their dotted ``module.name`` names.
        """
        name = self.__class__.__name__
        if not self.types:
            return "%s(%r)" % (name, self.index)
        typenames = ", ".join("%s.%s" % (cls.__module__, cls.__name__) for cls in self.types)
        return "%s(%r, %s)" % (name, self.index, typenames)
Note: See TracBrowser for help on using the browser.