root/livinglogic.python.xist/src/ll/xist/scripts/xml2xsc.py @ 4437:6f3d4e845072

Revision 4437:6f3d4e845072, 6.8 KB (checked in by Walter Doerwald <walter@…>, 8 years ago)

Fix typos in script documentation. Add examples. Bump version number.

RevLine 
[2522]1#!/usr/bin/env python
[2903]2# -*- coding: utf-8 -*-
[2522]3
[4422]4## Copyright 1999-2011 by LivingLogic AG, Bayreuth/Germany
5## Copyright 1999-2011 by Walter Dörwald
[2789]6##
7## All Rights Reserved
8##
[3263]9## See ll/__init__.py for the license
[2789]10
[2522]11
[4423]12"""
[4437]13Purpose
14-------
15
``xml2xsc`` is a script that generates an XIST namespace module from one or more
XML files. ``xml2xsc`` will output an XIST element class for each element it
encounters in any of the XML files. The attributes and model information
``xml2xsc`` assigns to an element will be collected from each occurrence of the
element in the XML files, so the XML files should cover as many different cases
as possible.
22
[4437]23
24Options
25-------
26
[4432]27``xml2xsc`` supports the following options:
28
29    ``urls``
[4433]30        Zero or more URLs (or filenames) of XML files to be parsed. If no URL is
31        given stdin will be read.
[4432]32
    ``-p``, ``--parser`` : ``etree`` or ``lxml``
        Which XML parser should be used for parsing the XML files? (``etree`` is
        the default, ``lxml`` requires that lxml_ is installed)
36
    ``-s``, ``--shareattrs`` : ``none``, ``dupes``, ``all``
        Should attributes be shared among the elements? ``none`` means that each
        element will have its own standalone :class:`Attrs` class directly derived
        from :class:`ll.xist.xsc.Element.Attrs`. For ``dupes`` each attribute that
        is used by more than one element will be moved into its own :class:`Attrs`
        class. For ``all`` this will be done for all attributes.
43
    ``-m``, ``--model`` : ``none``, ``simple``, ``fullall``, ``fullonce``
        Add model information to the namespace. ``none`` doesn't add any model
        information. ``simple`` only adds ``model = False`` or ``model = True``
        (i.e. only the information whether the element must be empty or not).
        ``fullall`` adds a :mod:`ll.xist.sims` model object to each element class.
        ``fullonce`` adds full model information too, but reuses model objects for
        elements which have the same model.
51
52    ``-x``, ``--defaultxmlns``
53        The default namespace name. All elements that don't belong to any
54        namespace will be assigned to this namespace.
55
56    .. _lxml: http://lxml.de/
[4437]57
58
59Example
60-------
61
62Suppose we have the following XML file (named ``foo.xml``)::
63
64    <x a="0"><x b="1"/><y/></x>
65
66Then we can generate a skeleton XIST namespace from it with the following command::
67
68    xml2xsc foo.xml -xhttp://xmlns.example.org/ -mfullonce
69
70The output will be::
71
72    # -*- coding: ascii -*-
73
74
75    from ll.xist import xsc, sims
76
77
78    xmlns = 'http://xmlns.example.org/'
79
80
81    class x(xsc.Element):
82        xmlns = xmlns
83        class Attrs(xsc.Element.Attrs):
84            class a(xsc.TextAttr): pass
85            class b(xsc.TextAttr): pass
86
87
88    class y(xsc.Element): xmlns = xmlns
89
90
91    x.model = sims.Elements(y, x)
92    y.model = sims.Empty()
[4423]93"""
94
95
[4289]96import sys, argparse, cStringIO
[2522]97
[4289]98from ll import misc, url
[2522]99from ll.xist import xsc, xnd, sims
100
101
# Markup format used by the docstrings in this module.
__docformat__ = "reStructuredText"
[3109]103
104
def iterpath(node):
    """
    Generate a path for every node and every chunk of text in the tree rooted
    at ``node``.

    Each path is a list that starts with ``node`` and ends with the item the
    path leads to. This item is either a tree node or a non-empty string (the
    ``text`` of a node or the ``tail`` of one of its children). The item before
    the last one is the parent the last item belongs to.

    The ``tail`` of a child is text that follows the child *inside its parent*,
    so it is reported as ``[..., parent, tail]`` and not as content of the
    child itself. The ``tail`` of ``node`` itself is not reported, as the
    path has no parent it could be attributed to.
    """
    yield [node]
    text = getattr(node, "text", None)
    if text:
        yield [node, text]
    # Strings are iterable too, but they never have child nodes
    if isinstance(node, (type(""), type(u""))):
        return
    # Iterate the node directly instead of probing for the deprecated
    # ``getchildren`` method, which newer ElementTree versions no longer have
    try:
        children = iter(node)
    except TypeError:
        return
    for child in children:
        for path in iterpath(child):
            yield [node] + path
        tail = getattr(child, "tail", None)
        if tail:
            yield [node, tail]
[2522]115
116
def getelementname(node):
    """
    Return the ``(name, xmlns)`` pair for the element ``node``.

    ``node.tag`` is expected in ElementTree notation: a tag of the form
    ``{namespace}name`` is split into its two parts, for any other tag the
    namespace in the result is ``None``.
    """
    tag = node.tag
    if not tag.startswith("{"):
        return (tag, None)
    # Drop the opening brace and split at the closing one
    (xmlns, _, name) = tag[1:].partition("}")
    return (name, xmlns)
[2522]123
124
def addetree2xnd(ns, node, elements):
    """
    Collect the elements, attributes and processing instructions found in the
    tree rooted at ``node`` and record how they are nested.

    ``ns`` is the object the definitions are added to (via ``+=``); its
    ``elements`` and ``procinsts`` attributes are used to look up what is
    already known. ``elements`` is a dictionary that maps ``(name, xmlns)``
    to the set of content found inside that element. Entries of this set are
    either element definitions from ``ns`` or one of the markers ``"#text"``,
    ``"#whitespace"``, ``"#comment"`` and ``"#procinst"``.
    """
    # Iterate through the tree and collect which elements are encountered and how they are nested
    for path in iterpath(node):
        current = path[-1]
        typename = type(current).__name__
        # Start from scratch for every path, so that a node we don't handle
        # can never be recorded under the entry of the previous path
        xndnode = None
        if "Element" in typename:
            key = getelementname(current)
            if key in ns.elements:
                xndnode = ns.elements[key]
            else:
                (name, xmlns) = key
                xndnode = xnd.Element(name, xmlns=xmlns)
                ns += xndnode
                elements[key] = set()
            for attrname in current.keys():
                # Attributes in ``{namespace}name`` notation are skipped
                if not attrname.startswith("{") and attrname not in xndnode.attrs:
                    xndnode += xnd.Attr(attrname, type=xsc.TextAttr)
        elif "ProcessingInstruction" in typename:
            name = current.target
            if name not in ns.procinsts:
                ns += xnd.ProcInst(name)
            # Record a marker (like for comments) instead of reusing whatever
            # ``xndnode`` was left over from the previous path
            xndnode = "#procinst"
        elif "Comment" in typename:
            xndnode = "#comment"
        elif isinstance(current, basestring):
            if current.isspace():
                xndnode = "#whitespace"
            else:
                xndnode = "#text"
        if xndnode is not None and len(path) >= 2:
            parent = path[-2]
            if "Element" in type(parent).__name__:
                elements[getelementname(parent)].add(xndnode)
156
157
def makexnd(urls, parser="etree", shareattrs="dupes", model="simple", defaultxmlns=None):
    """
    Parse the XML sources in ``urls`` and return an :class:`xnd.Module` object
    describing the elements, attributes and processing instructions found.

    ``urls`` may contain :class:`url.URL` objects (which will be opened for
    reading), ``str`` objects (which are treated as the XML source itself) or
    anything else the parser accepts (e.g. a file object). If ``urls`` is
    empty ``sys.stdin`` is parsed.

    ``parser`` must be ``"etree"`` or ``"lxml"``. ``model`` must be ``"none"``,
    ``"simple"``, ``"fullall"`` or ``"fullonce"``. A :exc:`ValueError` is
    raised for any other value of ``parser`` or ``model``. For ``shareattrs``
    the values ``"dupes"`` and ``"all"`` trigger attribute sharing, any other
    value leaves the attributes untouched.
    """
    contents = {} # maps (name, xmlns) to content set
    ns = xnd.Module(defaultxmlns=defaultxmlns, model=model)
    with url.Context():
        for source in (urls or [sys.stdin]):
            if isinstance(source, url.URL):
                source = source.openread()
            elif isinstance(source, str):
                source = cStringIO.StringIO(source)
            # The parser modules are imported on demand, so lxml is only
            # required when it is actually requested
            if parser == "etree":
                from xml.etree import cElementTree
                tree = cElementTree.parse(source)
            elif parser == "lxml":
                from lxml import etree
                tree = etree.parse(source)
            else:
                raise ValueError("unknown parser {!r}".format(parser))
            addetree2xnd(ns, tree.getroot(), contents)

    # Put sims info into the element definitions
    if model == "simple":
        for (fullname, modelset) in contents.iteritems():
            ns.elements[fullname].modeltype = bool(modelset)
    elif model in ("fullall", "fullonce"):
        for (fullname, modelset) in contents.iteritems():
            element = ns.elements[fullname]
            if not modelset:
                element.modeltype = "sims.Empty"
                continue
            children = [el for el in modelset if isinstance(el, xnd.Element)]
            hastext = "#text" in modelset
            if children:
                element.modeltype = "sims.ElementsOrText" if hastext else "sims.Elements"
                element.modelargs = children
            else:
                element.modeltype = "sims.NoElements" if hastext else "sims.NoElementsOrText"
    elif model != "none":
        raise ValueError("unknown sims mode {!r}".format(model))

    if shareattrs in ("dupes", "all"):
        # ``True`` shares all attributes, ``False`` only those used more than once
        ns.shareattrs(shareattrs == "all")
    return ns
[2522]211
212
def main(args=None):
    """
    Command line entry point: parse ``args`` (or ``sys.argv[1:]`` if ``args``
    is ``None``), build the namespace with :func:`makexnd` and print it to
    stdout.
    """
    p = argparse.ArgumentParser(
        description="Convert XML files to XIST namespace (on stdout)",
    )
    p.add_argument(
        "urls",
        metavar="urls",
        type=url.URL,
        nargs="*",
        help="URLs of XML files to be parsed (default stdin)",
    )
    p.add_argument(
        "-p", "--parser",
        dest="parser",
        choices=("etree", "lxml"),
        default="etree",
        help="parser module to use for XML parsing (default: %(default)s)",
    )
    p.add_argument(
        "-s", "--shareattrs",
        dest="shareattrs",
        choices=("none", "dupes", "all"),
        default="dupes",
        help="Should identical attributes be shared among elements? (default: %(default)s)",
    )
    p.add_argument(
        "-m", "--model",
        dest="model",
        choices=("none", "simple", "fullall", "fullonce"),
        default="simple",
        help="Create sims info? (default: %(default)s)",
    )
    p.add_argument(
        "-x", "--defaultxmlns",
        dest="defaultxmlns",
        metavar="NAME",
        help="Force elements without a namespace into this namespace",
    )

    # The option names match the parameter names of ``makexnd``, so the
    # parsed options can be passed on as keyword arguments
    options = p.parse_args(args)
    print(makexnd(**vars(options)))
[2522]223
224
if __name__ == "__main__":
    # Run the script and hand the result of ``main`` to the interpreter as exit status
    exitstatus = main()
    sys.exit(exitstatus)
Note: See TracBrowser for help on using the browser.