root/livinglogic.python.xist/src/ll/xist/scripts/xml2xsc.py @ 4433:513c8e5bf375

Revision 4433:513c8e5bf375, 6.2 KB (checked in by Walter Doerwald <walter@…>, 8 years ago)

xml2xsc reads stdin if no URL is given.

Line 
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4## Copyright 1999-2011 by LivingLogic AG, Bayreuth/Germany
5## Copyright 1999-2011 by Walter Dörwald
6##
7## All Rights Reserved
8##
9## See ll/__init__.py for the license
10
11
12"""
13``xml2xsc`` is a script that generates an XIST namespace module from one or more
14XML files. ``xml2xsc`` will output an XIST element class for each element it
15encounters in any of the XML files. The attributes and model information
16``xml2xsc`` assigns to an element will be collected from each occurrence of the
17element in the XML files, so the XML files should cover as many different cases
18as possible.
19
20``xml2xsc`` supports the following options:
21
22    ``urls``
23        Zero or more URLs (or filenames) of XML files to be parsed. If no URL is
24        given stdin will be read.
25
26    ``-p``, ``--parser`` : ``etree`` or ``lxml``
27        Which XML parser should be used for parsing the XML files? (``etree`` is
28        the default, ``lxml`` requires that lxml_ is installed)
29
30    ``-s``, ``--shareattrs`` : ``none``, ``dupes``, ``all``
31        Should attributes be shared among the elements? ``none`` means that each
32        element will have its own standalone :class:`Attrs` class directly derived
33        from :class:`ll.xist.Elements.Attrs`. For ``dupes`` each attribute that is
34        used by more than one element will be moved into its own :class:`Attrs`
35        class. For ``all`` this will be done for all attributes.
36
37    ``-m``, ``--model`` : ``none``, ``simple``, ``fullall``, ``fullonce``
38        Add model information to the namespace. ``none`` doesn't add any model
39        information. ``simple`` only adds ``model = False`` or ``model = True``
40        (i.e. only the information whether the element must be empty or not).
41        ``fullall`` adds a :mod:`ll.xist.sims` model object to each element class.
42        ``fullonce`` adds full model information too, but reuses model objects for
43        elements which have the same model.
44
45    ``-x``, ``--defaultxmlns``
46        The default namespace name. All elements that don't belong to any
47        namespace will be assigned to this namespace.
48
49    .. _lxml: http://lxml.de/
50"""
51
52
53import sys, argparse, cStringIO
54
55from ll import misc, url
56from ll.xist import xsc, xnd, sims
57
58
59__docformat__ = "reStructuredText"
60
61
def iterpath(node):
    """
    Generate all paths from ``node`` down to each node and text chunk below it.

    ``node`` is an ElementTree-style node (anything that may have ``text`` and
    ``tail`` attributes and can be iterated to get its children). Each path is
    a list that starts with ``node`` and ends either with a descendant node or
    with a non-empty text string. Paths are produced in document order.

    Text following a child's end tag (the child's ``tail``) is content of the
    *parent*, so it is reported as ``[..., parent, tail]`` and not as content
    of the child itself. The ``tail`` of ``node`` itself is never reported,
    because ``node`` has no parent inside the generated paths.
    """
    yield [node]
    text = getattr(node, "text", None)
    if text:
        yield [node, text]
    # Don't rely on ``getchildren()``: newer ElementTree versions no longer
    # provide it, which would silently skip all children.
    try:
        children = iter(node)
    except TypeError:
        return
    for child in children:
        for path in iterpath(child):
            yield [node] + path
        # The tail belongs to the content of ``node``, not to ``child``
        tail = getattr(child, "tail", None)
        if tail:
            yield [node, tail]
72
73
def getelementname(node):
    """
    Split the tag of ``node`` into a ``(name, xmlns)`` tuple.

    ``node.tag`` is expected in ElementTree notation: either a plain name or
    ``{namespace}name``. For a plain name ``xmlns`` is ``None``.
    """
    tag = node.tag
    if tag.startswith("{"):
        # "{ns}name" -> drop the opening brace and split at the closing one
        (xmlns, sep, name) = tag[1:].partition("}")
        return (name, xmlns)
    return (tag, None)
80
81
def addetree2xnd(ns, node, elements):
    """
    Add the elements, attributes and processing instructions found in the
    tree below ``node`` to the :mod:`xnd` namespace ``ns``.

    ``elements`` is a dictionary that maps ``(name, xmlns)`` tuples to the set
    of things that have been seen as content of that element. This set
    contains :class:`xnd.Element` objects and the strings ``"#text"``,
    ``"#whitespace"`` and ``"#comment"``. Both ``ns`` and ``elements`` are
    updated in place.
    """
    # Iterate through the tree and collect which elements are encountered and how they are nested
    for path in iterpath(node):
        current = path[-1]
        typename = type(current).__name__
        # Reset for every path, so that a node that contributes nothing to the
        # content model (e.g. a processing instruction) can't leak the value
        # from the previous path into its parent's content set.
        xndnode = None
        if "Element" in typename:
            fullname = getelementname(current)
            if fullname in ns.elements:
                xndnode = ns.elements[fullname]
            else:
                (name, xmlns) = fullname
                xndnode = xnd.Element(name, xmlns=xmlns)
                ns += xndnode
            # Make sure a content set exists, even if ``ns`` already knew the element
            elements.setdefault(fullname, set())
            for attrname in current.keys():
                # Attributes in a namespace ("{ns}name") are skipped
                if not attrname.startswith("{") and attrname not in xndnode.attrs:
                    xndnode += xnd.Attr(attrname, type=xsc.TextAttr)
        elif "ProcessingInstruction" in typename:
            name = current.target
            if name not in ns.procinsts:
                ns += xnd.ProcInst(name)
        elif "Comment" in typename:
            xndnode = "#comment"
        elif isinstance(current, basestring):
            if current.isspace():
                xndnode = "#whitespace"
            else:
                xndnode = "#text"
        # Record the node as content of its parent element (if it has one)
        if xndnode is not None and len(path) >= 2:
            parent = path[-2]
            if "Element" in type(parent).__name__:
                elements[getelementname(parent)].add(xndnode)
113
114
def makexnd(urls, parser="etree", shareattrs="dupes", model="simple", defaultxmlns=None):
    """
    Parse XML input and return an :class:`xnd.Module` describing it.

    ``urls`` is a sequence of inputs. A :class:`url.URL` object is opened for
    reading, a :class:`str` is treated as the XML source itself and anything
    else is passed to the parser unchanged. If ``urls`` is empty ``sys.stdin``
    is parsed.

    ``parser`` must be ``"etree"`` or ``"lxml"``. ``model`` must be ``"none"``,
    ``"simple"``, ``"fullall"`` or ``"fullonce"``. For any other value of
    ``parser`` or ``model`` a :exc:`ValueError` is raised before any input is
    read.

    For ``shareattrs`` the value ``"dupes"`` calls ``ns.shareattrs(False)`` and
    ``"all"`` calls ``ns.shareattrs(True)``; any other value leaves the
    attributes untouched.

    ``defaultxmlns`` is passed on to :class:`xnd.Module`.
    """
    # Validate the arguments up front, so that nothing is opened or parsed in vain
    if parser == "etree":
        from xml.etree import cElementTree as etree
    elif parser == "lxml":
        from lxml import etree
    else:
        raise ValueError("unknown parser {!r}".format(parser))
    if model not in ("none", "simple", "fullall", "fullonce"):
        raise ValueError("unknown sims mode {!r}".format(model))

    elements = {} # maps (name, xmlns) to content set
    ns = xnd.Module(defaultxmlns=defaultxmlns, model=model)
    with url.Context():
        if not urls:
            urls = [sys.stdin]
        for u in urls:
            if isinstance(u, url.URL):
                u = u.openread()
            elif isinstance(u, str):
                u = cStringIO.StringIO(u)
            addetree2xnd(ns, etree.parse(u).getroot(), elements)

    # Put sims info into the element definitions
    if model == "simple":
        for (fullname, modelset) in elements.items():
            ns.elements[fullname].modeltype = bool(modelset)
    elif model in ("fullall", "fullonce"):
        for (fullname, modelset) in elements.items():
            element = ns.elements[fullname]
            # Use a separate name: ``elements`` is the dictionary being iterated
            children = [el for el in modelset if isinstance(el, xnd.Element)]
            hastext = "#text" in modelset
            if not modelset:
                element.modeltype = "sims.Empty"
            elif children:
                element.modeltype = "sims.ElementsOrText" if hastext else "sims.Elements"
                element.modelargs = children
            else:
                element.modeltype = "sims.NoElements" if hastext else "sims.NoElementsOrText"

    if shareattrs == "dupes":
        ns.shareattrs(False)
    elif shareattrs == "all":
        ns.shareattrs(True)
    return ns
168
169
def main(args=None):
    """
    Command line entry point: parse ``args`` (or ``sys.argv[1:]`` if ``args``
    is ``None``), build the namespace with :func:`makexnd` and print it to
    stdout.
    """
    p = argparse.ArgumentParser(description="Convert XML files to XIST namespace (on stdout)")
    p.add_argument("urls", metavar="urls", type=url.URL, help="URLs of XML files to be parsed (default stdin)", nargs="*")
    p.add_argument("-p", "--parser", dest="parser", help="parser module to use for XML parsing (default: %(default)s)", choices=("etree", "lxml"), default="etree")
    p.add_argument("-s", "--shareattrs", dest="shareattrs", help="Should identical attributes be shared among elements? (default: %(default)s)", choices=("none", "dupes", "all"), default="dupes")
    p.add_argument("-m", "--model", dest="model", help="Create sims info? (default: %(default)s)", choices=("none", "simple", "fullall", "fullonce"), default="simple")
    p.add_argument("-x", "--defaultxmlns", dest="defaultxmlns", metavar="NAME", help="Force elements without a namespace into this namespace")

    args = p.parse_args(args)
    # The option names match the parameter names of makexnd(), so the parsed
    # namespace can be passed on as keyword arguments. The parenthesized form
    # of print gives the same output with the print statement and the print
    # function.
    print(makexnd(**vars(args)))
180
181
# Run the command line interface when executed as a script; main() returns
# None, so the exit status is 0 unless an exception is raised.
if __name__ == "__main__":
    sys.exit(main())
Note: See TracBrowser for help on using the browser.