root/livinglogic.python.xist/src/ll/xist/scripts/xml2xsc.py @ 4437:6f3d4e845072

Revision 4437:6f3d4e845072, 6.8 KB (checked in by Walter Doerwald <walter@…>, 8 years ago)

Fix typos in script documentation. Add examples. Bump version number.

Line 
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4## Copyright 1999-2011 by LivingLogic AG, Bayreuth/Germany
5## Copyright 1999-2011 by Walter Dörwald
6##
7## All Rights Reserved
8##
9## See ll/__init__.py for the license
10
11
12"""
13Purpose
14-------
15
16``xml2xsc`` is a script that generates an XIST namespace module from one or more
17XML files. ``xml2xsc`` will output an XIST element class for each element it
18encounters in any of the XML files. The attributes and model information
19``xml2xsc`` assigns to an element will be collected from each occurrence of the
20element in the XML files, so the XML files should cover as many different cases
21as possible.
22
23
24Options
25-------
26
27``xml2xsc`` supports the following options:
28
29    ``urls``
30        Zero or more URLs (or filenames) of XML files to be parsed. If no URL is
31        given stdin will be read.
32
33    ``-p``, ``--parser`` : ``etree`` or ``lxml``
34        Which XML parser should be used for parsing the XML files? (``etree`` is
35        the default, ``lxml`` requires that lxml_ is installed)
36
37    ``-s``, ``--shareattrs`` : ``none``, ``dupes``, ``all``
38        Should attributes be shared among the elements? ``none`` means that each
39        element will have its own standalone :class:`Attrs` class directly derived
40        from :class:`ll.xist.xsc.Element.Attrs`. For ``dupes`` each attribute that is
41        used by more than one element will be moved into its own :class:`Attrs`
42        class. For ``all`` this will be done for all attributes.
43
44    ``-m``, ``--model`` : ``none``, ``simple``, ``fullall``, ``fullonce``
45        Add model information to the namespace. ``none`` doesn't add any model
46        information. ``simple`` only adds ``model = False`` or ``model = True``
47        (i.e. only the information whether the element must be empty or not).
48        ``fullall`` adds a :mod:`ll.xist.sims` model object to each element class.
49        ``fullonce`` adds full model information too, but reuses model objects for
50        elements which have the same model.
51
52    ``-x``, ``--defaultxmlns``
53        The default namespace name. All elements that don't belong to any
54        namespace will be assigned to this namespace.
55
56    .. _lxml: http://lxml.de/
57
58
59Example
60-------
61
62Suppose we have the following XML file (named ``foo.xml``)::
63
64    <x a="0"><x b="1"/><y/></x>
65
66Then we can generate a skeleton XIST namespace from it with the following command::
67
68    xml2xsc foo.xml -xhttp://xmlns.example.org/ -mfullonce
69
70The output will be::
71
72    # -*- coding: ascii -*-
73
74
75    from ll.xist import xsc, sims
76
77
78    xmlns = 'http://xmlns.example.org/'
79
80
81    class x(xsc.Element):
82        xmlns = xmlns
83        class Attrs(xsc.Element.Attrs):
84            class a(xsc.TextAttr): pass
85            class b(xsc.TextAttr): pass
86
87
88    class y(xsc.Element): xmlns = xmlns
89
90
91    x.model = sims.Elements(y, x)
92    y.model = sims.Empty()
93"""
94
95
96import sys, argparse, cStringIO
97
98from ll import misc, url
99from ll.xist import xsc, xnd, sims
100
101
102__docformat__ = "reStructuredText"
103
104
def iterpath(node):
    """
    Generate the paths to all nodes in the ElementTree-style tree ``node``.

    Each path is a list that starts at ``node`` and ends at the node the path
    refers to. Besides the element nodes themselves, paths are generated for
    non-empty text content: the last item of such a path is the string and
    the item before it is the element that *contains* the text.

    The text in an element's ``tail`` attribute follows the element's end tag,
    so it is reported as content of the parent element, not of the element
    carrying the ``tail`` attribute. The ``tail`` of ``node`` itself is not
    reported, as it lies outside of the tree rooted at ``node``.
    """
    yield [node]
    if getattr(node, "text", None):
        yield [node, node.text]
    # ``getchildren`` is gone from ``xml.etree`` elements in Python 3.9 and
    # later, so anything with a ``tag`` is accepted as a container too.
    if hasattr(node, "getchildren") or hasattr(node, "tag"):
        for child in node:
            for path in iterpath(child):
                yield [node] + path
            # Text after the child's end tag is content of ``node``
            tail = getattr(child, "tail", None)
            if tail:
                yield [node, tail]
115
116
def getelementname(node):
    """
    Split the tag of ``node`` into its local name and namespace name.

    Returns a ``(name, xmlns)`` tuple. For a tag in the ``{xmlns}name``
    notation both parts are extracted, for any other tag ``xmlns`` is
    :const:`None` and ``name`` is the complete tag.
    """
    tag = node.tag
    if not tag.startswith("{"):
        return (tag, None)
    # Drop the leading "{" and split at the first "}"
    (namespace, _, localname) = tag[1:].partition("}")
    return (localname, namespace)
123
124
def addetree2xnd(ns, node, elements):
    """
    Add the information found in the tree ``node`` to the module ``ns``.

    ``node`` is traversed with :func:`iterpath`. Each element and processing
    instruction that isn't known to ``ns`` yet is added to it, and each
    attribute without a namespace is added to its element as a text attribute.

    ``elements`` is a dictionary mapping ``(name, xmlns)`` tuples to the set
    of content found inside the element. It is updated in place. An entry in
    such a set is either the :class:`xnd.Element` object of a child element or
    one of the strings ``"#text"``, ``"#whitespace"``, ``"#comment"`` and
    ``"#procinst"``.
    """
    # Iterate through the tree and collect which elements are encountered and how they are nested
    for path in iterpath(node):
        current = path[-1]
        typename = type(current).__name__
        # Start from scratch for each path, so that a node that isn't handled
        # below can't leak the entry of the previous path into its parent
        xndnode = None
        if "Element" in typename:
            (name, xmlns) = getelementname(current)
            if (name, xmlns) in ns.elements:
                xndnode = ns.elements[(name, xmlns)]
            else:
                xndnode = xnd.Element(name, xmlns=xmlns)
                ns += xndnode
                elements[(name, xmlns)] = set()
            for attrname in current.keys():
                # Attributes in a namespace (``{xmlns}name``) are skipped
                if not attrname.startswith("{") and attrname not in xndnode.attrs:
                    xndnode += xnd.Attr(attrname, type=xsc.TextAttr)
        elif "ProcessingInstruction" in typename:
            name = current.target
            if name not in ns.procinsts:
                ns += xnd.ProcInst(name)
            # Recorded like a comment: the parent has content, but neither
            # an element nor text
            xndnode = "#procinst"
        elif "Comment" in typename:
            xndnode = "#comment"
        elif isinstance(current, basestring):
            if current.isspace():
                xndnode = "#whitespace"
            else:
                xndnode = "#text"
        if xndnode is not None and len(path) >= 2:
            parent = path[-2]
            # Only elements have an entry in ``elements``
            if "Element" in type(parent).__name__:
                elements[getelementname(parent)].add(xndnode)
156
157
def makexnd(urls, parser="etree", shareattrs="dupes", model="simple", defaultxmlns=None):
    """
    Create an :class:`xnd.Module` object from the XML sources in ``urls``.

    Each item in ``urls`` may be a :class:`url.URL` object (which will be
    opened for reading), a :class:`str` (which is wrapped in a ``StringIO``
    object, i.e. it is treated as the XML source itself) or anything else the
    parser is able to parse directly. If ``urls`` is empty, ``sys.stdin`` is
    parsed.

    ``parser`` must be ``"etree"`` or ``"lxml"``, ``model`` must be
    ``"none"``, ``"simple"``, ``"fullall"`` or ``"fullonce"``; any other value
    raises a :exc:`ValueError`. For ``shareattrs`` the values ``"dupes"`` and
    ``"all"`` result in a call to the ``shareattrs`` method of the module; any
    other value leaves the attributes alone.
    """
    contents = {} # maps (name, xmlns) to content set
    ns = xnd.Module(defaultxmlns=defaultxmlns, model=model)
    with url.Context():
        for source in (urls or [sys.stdin]):
            if isinstance(source, url.URL):
                source = source.openread()
            elif isinstance(source, str):
                source = cStringIO.StringIO(source)

            if parser == "etree":
                from xml.etree import cElementTree
                tree = cElementTree.parse(source)
            elif parser == "lxml":
                from lxml import etree
                tree = etree.parse(source)
            else:
                raise ValueError("unknown parser {!r}".format(parser))
            addetree2xnd(ns, tree.getroot(), contents)

    # Put sims info into the element definitions
    if model == "simple":
        # Only record whether anything was found inside the element
        for (fullname, content) in contents.iteritems():
            ns.elements[fullname].modeltype = bool(content)
    elif model in ("fullall", "fullonce"):
        for (fullname, content) in contents.iteritems():
            element = ns.elements[fullname]
            if not content:
                element.modeltype = "sims.Empty"
                continue
            children = [item for item in content if isinstance(item, xnd.Element)]
            hastext = "#text" in content
            if children:
                element.modeltype = "sims.ElementsOrText" if hastext else "sims.Elements"
                element.modelargs = children
            else:
                element.modeltype = "sims.NoElements" if hastext else "sims.NoElementsOrText"
    elif model != "none":
        raise ValueError("unknown sims mode {!r}".format(model))

    if shareattrs in ("dupes", "all"):
        ns.shareattrs(shareattrs == "all")
    return ns
211
212
213def main(args=None):
214    p = argparse.ArgumentParser(description="Convert XML files to XIST namespace (on stdout)")
215    p.add_argument("urls", metavar="urls", type=url.URL, help="URLs of XML files to be parsed (default stdin)", nargs="*")
216    p.add_argument("-p", "--parser", dest="parser", help="parser module to use for XML parsing (default: %(default)s)", choices=("etree", "lxml"), default="etree")
217    p.add_argument("-s", "--shareattrs", dest="shareattrs", help="Should identical attributes be shared among elements? (default: %(default)s)", choices=("none", "dupes", "all"), default="dupes")
218    p.add_argument("-m", "--model", dest="model", help="Create sims info? (default: %(default)s)", choices=("none", "simple", "fullall", "fullonce"), default="simple")
219    p.add_argument("-x", "--defaultxmlns", dest="defaultxmlns", metavar="NAME", help="Force elements without a namespace into this namespace")
220
221    args = p.parse_args(args)
222    print makexnd(**args.__dict__)
223
224
225if __name__ == "__main__":
226    sys.exit(main())
Note: See TracBrowser for help on using the browser.