Coverage for C:\leo.repo\leo-editor\leo\plugins\importers\xml.py : 91%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#@+leo-ver=5-thin
2#@+node:ekr.20140723122936.18137: * @file ../plugins/importers/xml.py
3"""The @auto importer for the xml language."""
4import re
5from leo.core import leoGlobals as g
6from leo.plugins.importers import linescanner
7Importer = linescanner.Importer
8Target = linescanner.Target
9#@+others
10#@+node:ekr.20161121204146.3: ** class Xml_Importer
11class Xml_Importer(Importer):
12 """The importer for the xml language."""
14 #@+others
15 #@+node:ekr.20161122124109.1: *3* xml_i.__init__
def __init__(self, importCommands, tags_setting='import_xml_tags', **kwargs):
    """
    Create an xml importer.

    importCommands: passed unchanged to the base Importer class.
    tags_setting:   name of the setting that add_tags reads (via
                    c.config.getData) to get the tags that start blocks.
    kwargs:         accepted for call compatibility; not used here.
    """
    # Init the base class.
    super().__init__(
        importCommands,
        language='xml',
        state_class=Xml_ScanState,
        strict=False,
    )
    self.tags_setting = tags_setting
    self.start_tags = self.add_tags()
    # Stack of tags, stored lower-cased and *without* the leading "<",
    # exactly as scan_tag pushes them.
    # A closing tag decrements state.tag_level only if the top is an opening tag.
    self.stack = []
    # Tags that end_tag pops from the stack on a plain ">".
    # Entries must have the form that scan_tag produces: the xml declaration
    # is scanned as "?xml", so an entry spelled "<?xml" could never match and
    # the declaration would stay on the stack for the rest of the file.
    self.void_tags = [
        '?xml',
        '!doctype',
    ]
    # True: a structure error has been detected.
    # Only warn once.
    self.tag_warning_given = False
37 #@+node:ekr.20161121204918.1: *3* xml_i.add_tags
def add_tags(self):
    """
    Return the list of block-starting tags, lower-cased.

    The tags come from c.config.getData(self.tags_setting); a missing or
    empty setting yields an empty list.
    """
    raw_tags = self.c.config.getData(self.tags_setting)
    if not raw_tags:
        return []
    return [tag.lower() for tag in raw_tags]
44 #@+node:ekr.20170416082422.1: *3* xml_i.clean_headline
def clean_headline(self, s, p=None):
    """
    xml and html: return the headline for the line s.

    If s starts (after optional whitespace) with a complete "<...>" tag,
    return just that tag; otherwise return s with surrounding whitespace
    removed. The p argument is not used.
    """
    found = re.match(r'\s*(<[^>]+>)', s)
    if found is None:
        return s.strip()
    return found.group(1)
49 #@+node:ekr.20161123003732.1: *3* xml_i.error
def error(self, s):
    """
    Report the message s, preceded by the headline of the root node.

    The message is only printed; nothing is raised. Also sets self.ws_error,
    which (per the original note) tells i.check to strip leading whitespace.
    """
    for message in ('\nin %s' % self.root.h, s):
        g.es_print(message)
    self.ws_error = True
56 #@+node:ekr.20161122073505.1: *3* xml_i.scan_line & helpers
def scan_line(self, s, prev_state):
    """
    Scan the line s, starting from prev_state.

    Return a new Xml_ScanState holding the context and tag level that are
    in effect at the end of the line.
    """
    context = prev_state.context
    tag_level = prev_state.tag_level
    i, n = 0, len(s)
    while i < n:
        start = i
        if context:
            # Inside a string or comment: look only for its terminator.
            context, i = self.scan_in_context(context, i, s)
        else:
            context, i, tag_level = self.scan_out_context(i, s, tag_level)
        # Every step must consume at least one character.
        assert i > start, (repr(s[i]), '***', repr(s))
    return Xml_ScanState({'context': context, 'tag_level': tag_level})
70 #@+node:ekr.20161122073937.1: *4* xml_i.scan_in_context
def scan_in_context(self, context, i, s):
    """
    Advance one step through s at index i while inside context.

    context is '"' (a double-quoted string) or '<!--' (a comment).
    Return (context, i): the context becomes '' when its terminator is
    found at i, and i moves past whatever was consumed.
    """
    assert context in ('"', '<!--'), repr(context)
    # Only double-quoted strings are valid strings in xml/html.
    closer = {'"': '"', '<!--': '-->'}.get(context)
    if closer is not None and self.match(s, i, closer):
        return '', i + len(closer)
    return context, i + 1
87 #@+node:ekr.20161122073938.1: *4* xml_i.scan_out_context & helpers
def scan_out_context(self, i, s, tag_level):
    """
    Advance one step through s at index i while outside any context.

    Return (context, i, tag_level). A '"' or '<!--' at i opens the
    corresponding context; '<' is handed to scan_tag; '/>' and '>' are
    handed to end_tag; any other character is skipped.
    """
    # Only double-quoted strings are xml/html strings.
    if self.match(s, i, '"'):
        return '"', i + 1, tag_level
    if self.match(s, i, '<!--'):
        return '<!--', i + 4, tag_level
    if self.match(s, i, '<'):
        # xml/html tags do *not* start contexts.
        i, tag_level = self.scan_tag(s, i, tag_level)
        return '', i, tag_level
    # '/>' must be tried before '>'.
    for closer in ('/>', '>'):
        if self.match(s, i, closer):
            tag_level = self.end_tag(s, tag=closer, tag_level=tag_level)
            return '', i + len(closer), tag_level
    return '', i + 1, tag_level
112 #@+node:ekr.20161122084808.1: *5* xml_i.end_tag
def end_tag(self, s, tag, tag_level):
    """
    Handle the ">" or "/>" that ends an element; return the new tag level.

    "/>" pops the top of the stack and lowers the level by one when the
    popped tag is in self.start_tags. ">" is ignored unless the top of the
    stack is in self.void_tags, in which case that tag is popped.
    A "/>" seen with an empty stack only produces a warning.
    """
    self_closing = tag == '/>'
    if not self.stack:
        if self_closing:
            g.es_print("Warning: ignoring dubious /> in...")
            g.es_print(repr(s))
        return tag_level
    if self_closing:
        if self.stack.pop() in self.start_tags:
            return tag_level - 1
        return tag_level
    if self.stack[-1] in self.void_tags:
        self.stack.pop()
    return tag_level
132 #@+node:ekr.20161122080143.1: *5* xml_i.scan_tag & helper
# Matches a tag name, optionally preceded by "!" or "?".
ch_pattern = re.compile(r'([\!\?]?[\w\_\.\:\-]+)', re.UNICODE)

def scan_tag(self, s, i, tag_level):
    """
    Scan the xml tag that starts at s[i], which must be "<" (or "</").

    Return (i, tag_level), with i just past the tag name.

    The lower-cased tag name adjusts the stack:
    - "<" pushes the name.
    - "</" asks pop_to_tag to remove it.
    The tag level changes only for names in self.start_tags.
    If no tag name follows, an error is reported and the level is unchanged.
    """
    assert s[i] == '<', repr(s[i])
    is_closing = self.match(s, i, '</')
    # Skip the opening "<" or "</".
    i += 2 if is_closing else 1
    found = self.ch_pattern.match(s, i)
    if not found:
        # All other '<' characters should have had xml/html escapes applied to them.
        self.error('missing tag in position %s of %r' % (i, s))
        g.es_print(repr(s))
        return i, tag_level
    name = found.group(0)
    i += len(name)
    tag = name.lower()
    if is_closing:
        self.pop_to_tag(tag, s)
        step = -1
    else:
        self.stack.append(tag)
        step = 1
    if tag in self.start_tags:
        tag_level += step
    return i, tag_level
165 #@+node:ekr.20170416043508.1: *6* xml_i.pop_to_tag
def pop_to_tag(self, tag, s):
    """
    Remove tag from the stack, if possible.

    - Empty stack: report an error and return.
    - tag is somewhere on the stack: pop entries from the top until tag
      itself has been popped. This covers both the normal case (tag is on
      top) and recovery from unclosed inner tags.
    - tag is not on the stack: leave the stack alone.
    """
    if not self.stack:
        self.error('Empty tag stack: %s' % tag)
        g.es_print(repr(s))
        return
    if tag in self.stack:
        # The membership test guarantees that this loop terminates.
        while self.stack.pop() != tag:
            pass
188 #@+node:ekr.20161121210839.1: *3* xml_i.starts_block
def starts_block(self, i, lines, new_state, prev_state):
    """
    Return True if the scanned line raised the tag level.

    Only the two states are compared; i and lines are not used.
    """
    return prev_state.tag_level < new_state.tag_level
192 #@+node:ekr.20161121212858.1: *3* xml_i.is_ws_line
193 # Warning: base Importer class defines ws_pattern.
# Whitespace, optionally interleaved with complete <!-- ... --> comments.
xml_ws_pattern = re.compile(r'\s*(<!--([^-]|-[^-])*-->\s*)*$')

def is_ws_line(self, s):
    """Return True if s holds nothing but whitespace and complete single-line comments."""
    return self.xml_ws_pattern.match(s) is not None
199 #@+node:ekr.20161123005742.1: *3* xml_i.undent
def undent(self, p):
    """
    Return the lines of p with only the @others lines re-indented.

    An @others line loses its leading whitespace; outside the root node it
    then gets one indentation unit (spaces for a negative tab width,
    otherwise a tab). Whitespace-only lines become a bare newline. Every
    other line is returned untouched (original note: this preserves the
    indentation of embedded brython code, issue #479).
    """
    width = self.tab_width
    pad = '\t' if width >= 0 else ' ' * abs(width)
    out = []
    for line in self.get_lines(p):
        if line.isspace():
            out.append('\n')
            continue
        stripped = line.lstrip()
        if not stripped.startswith('@others'):
            out.append(line)
        elif p == self.root:
            out.append(stripped)
        else:
            out.append(pad + stripped)
    return out
218 #@-others
219#@+node:ekr.20161121204146.7: ** class class Xml_ScanState
class Xml_ScanState:
    """The state of the line-oriented xml scan: the open context and the tag level."""

    def __init__(self, d=None):
        """
        Create a state from the dict d, using its 'context' and 'tag_level'
        entries. A missing or empty d gives context '' and tag level 0.
        """
        if not d:
            d = {'context': '', 'tag_level': 0}
        self.context = d.get('context')
        self.tag_level = d.get('tag_level')

    def __repr__(self):
        """Return a printable summary of the state."""
        fields = (self.context, self.tag_level)
        return "Xml_ScanState context: %r tag_level: %s" % fields

    __str__ = __repr__

    #@+others
    #@+node:ekr.20161121204146.8: *3* xml_state.level
    def level(self):
        """Return the tag level of this state."""
        return self.tag_level
    #@-others
245#@-others
# Module-level registration record: 'func' is whatever
# Xml_Importer.do_import() returns, and 'extensions' lists the file
# suffixes this importer handles. (The consumer of this record is not
# visible in this file.)
importer_dict = dict(
    func=Xml_Importer.do_import(),
    extensions=['.xml'],
)
250#@@language python
251#@@tabwidth -4
253#@-leo