//[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
//[4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
//[5] Name ::= NameStartChar (NameChar)*
var nameStartChar = /[A-Z_a-z\xC0-\xD6\xD8-\xF6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]///\u10000-\uEFFFF
var nameChar = new RegExp("[\\-\\.0-9" + nameStartChar.source.slice(1, -1) + "\u00B7\u0300-\u036F\\u203F-\u2040]");
var tagNamePattern = new RegExp('^' + nameStartChar.source + nameChar.source + '*(?:\:' + nameStartChar.source + nameChar.source + '*)?$');
//var tagNamePattern = /^[a-zA-Z_][\w\-\.]*(?:\:[a-zA-Z_][\w\-\.]*)?$/
//var handlers = 'resolveEntity,getExternalSubset,characters,endDocument,endElement,endPrefixMapping,ignorableWhitespace,processingInstruction,setDocumentLocator,skippedEntity,startDocument,startElement,startPrefixMapping,notationDecl,unparsedEntityDecl,error,fatalError,warning,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,comment,endCDATA,endDTD,endEntity,startCDATA,startDTD,startEntity'.split(',')
//S_TAG, S_ATTR, S_EQ, S_V
//S_ATTR_S, S_E, S_S, S_C
var S_TAG = 0;//tag name offerring
var S_ATTR = 1;//attr name offerring
var S_ATTR_S = 2;//attr name end and space offer
var S_EQ = 3;//=space?
var S_V = 4;//attr value(no quot value only)
var S_E = 5;//attr value end and no space(quot end)
var S_S = 6;//(attr value end || tag end ) && (space offer)
var S_C = 7;//closed el<el />
function XMLReader() {
}
XMLReader.prototype = {
parse: function (source, defaultNSMap, entityMap) {
var domBuilder = this.domBuilder;
domBuilder.startDocument();
_copy(defaultNSMap, defaultNSMap = {})
parse(source, defaultNSMap, entityMap,
domBuilder, this.errorHandler);
domBuilder.endDocument();
}
}
function parse(source, defaultNSMapCopy, entityMap, domBuilder, errorHandler) {
function fixedFromCharCode(code) {
// String.prototype.fromCharCode does not supports
// > 2 bytes unicode chars directly
if (code > 0xffff) {
code -= 0x10000;
var surrogate1 = 0xd800 + (code >> 10)
, surrogate2 = 0xdc00 + (code & 0x3ff);
return String.fromCharCode(surrogate1, surrogate2);
} else {
return String.fromCharCode(code);
}
}
function entityReplacer(a) {
var k = a.slice(1, -1);
if (k in entityMap) {
return entityMap[k];
} else if (k.charAt(0) === '#') {
return fixedFromCharCode(parseInt(k.substr(1).replace('x', '0x')))
} else {
errorHandler.error('entity not found:' + a);
return a;
}
}
function appendText(end) {//has some bugs
if (end > start) {
var xt = source.substring(start, end).replace(/&#?\w+;/g, entityReplacer);
locator && position(start);
domBuilder.characters(xt, 0, end - start);
start = end
}
}
function position(p, m) {
while (p >= lineEnd && (m = linePattern.exec(source))) {
lineStart = m.index;
lineEnd = lineStart + m[0].length;
locator.lineNumber++;
////console.log('line++:',locator,startPos,endPos)
}
locator.columnNumber = p - lineStart + 1;
}
var lineStart = 0;
var lineEnd = 0;
var linePattern = /.+(?:\r\n?|\n)|.*$/g
var locator = domBuilder.locator;
var parseStack = [{currentNSMap: defaultNSMapCopy}]
var closeMap = {};
var start = 0;
while (true) {
try {
var tagStart = source.indexOf('<', start);
if (tagStart < 0) {
if (!source.substr(start).match(/^\s*$/)) {
var doc = domBuilder.document;
var text = doc.createTextNode(source.substr(start));
doc.appendChild(text);
domBuilder.currentElement = text;
}
return;
}
if (tagStart > start) {
appendText(tagStart);
}
switch (source.charAt(tagStart + 1)) {
case '/':
var end = source.indexOf('>', tagStart + 3);
var tagName = source.substring(tagStart + 2, end);
var config = parseStack.pop();
var localNSMap = config.localNSMap;
if (config.tagName != tagName) {
errorHandler.fatalError("end tag name: " + tagName + ' is not match the current start tagName:' + config.tagName);
}
domBuilder.endElement(config.uri, config.localName, tagName);
if (localNSMap) {
for (var prefix in localNSMap) {
domBuilder.endPrefixMapping(prefix);
}
}
end++;
break;
// end elment
case '?':// <?...?>
locator && position(tagStart);
end = parseInstruction(source, tagStart, domBuilder);
break;
case '!':// <!doctype,<![CDATA,<!--
locator && position(tagStart);
end = parseDCC(source, tagStart, domBuilder, errorHandler);
break;
default:
//console.log('locator', locator)
locator && position(tagStart);
var el = new ElementAttributes();
//elStartEnd
var end = parseElementStartPart(source, tagStart, el, entityReplacer, errorHandler);
//console.log('end', end)
var len = el.length;
if (locator) {
if (len) {
//attribute position fixed
for (var i = 0; i < len; i++) {
var a = el[i];
position(a.offset);
a.offset = copyLocator(locator, {});
}
}
position(end);
}
//console.log('el', el)
if (!el.closed && fixSelfClosed(source, end, el.tagName, closeMap)) {
el.closed = true;
if (!entityMap.nbsp) {
errorHandler.warning('unclosed xml attribute');
}
}
//console.log('parseStack', parseStack)
appendElement(el, domBuilder, parseStack);
//console.log('el', el, parseStack)
if (el.uri === 'http://www.w3.org/1999/xhtml' && !el.closed) {
end = parseHtmlSpecialContent(source, end, el.tagName, entityReplacer, domBuilder)
} else {
end++;
}
}
} catch (e) {
errorHandler.error('element parse error: ' + e);
//console.log('element parse error: ', e)
end = -1;
}
//console.log('out', end, start)
if (end > start) {
start = end;
} else {
//TODO: 这里有可能sax回退,有位置错误风险
appendText(Math.max(tagStart, start) + 1);
}
}
}
function copyLocator(f, t) {
t.lineNumber = f.lineNumber;
t.columnNumber = f.columnNumber;
return t;
}
/**
* @see #appendElement(source,elStartEnd,el,selfClosed,entityReplacer,domBuilder,parseStack);
* @return end of the elementStartPart(end of elementEndPart for selfClosed el)
*/
function parseElementStartPart(source, start, el, entityReplacer, errorHandler) {
var attrName;
var value;
var p = ++start;
var s = S_TAG;//status
while (true) {
var c = source.charAt(p);
switch (c) {
case '=':
if (s === S_ATTR) {//attrName
attrName = source.slice(start, p);
s = S_EQ;
} else if (s === S_ATTR_S) {
s = S_EQ;
} else {
//fatalError: equal must after attrName or space after attrName
throw new Error('attribute equal must after attrName');
}
break;
case '\'':
case '"':
if (s === S_EQ) {//equal
start = p + 1;
p = source.indexOf(c, start)
if (p > 0) {
value = source.slice(start, p).replace(/&#?\w+;/g, entityReplacer);
el.add(attrName, value, start - 1);
s = S_E;
} else {
//fatalError: no end quot match
throw new Error('attribute value no end \'' + c + '\' match');
}
} else if (s == S_V) {
value = source.slice(start, p).replace(/&#?\w+;/g, entityReplacer);
////console.log(attrName,value,start,p)
el.add(attrName, value, start);
//console.dir(el)
errorHandler.warning('attribute "' + attrName + '" missed start quot(' + c + ')!!');
start = p + 1;
s = S_E
} else {
//fatalError: no equal before
throw new Error('attribute value must after "="');
}
break;
case '/':
switch (s) {
case S_TAG:
el.setTagName(source.slice(start, p));
case S_E:
case S_S:
case S_C:
s = S_C;
el.closed = true;
case S_V:
case S_ATTR:
case S_ATTR_S:
break;
//case S_EQ:
default:
throw new Error("attribute invalid close char('/')")
}
break;
case ''://end document
//throw new Error('unexpected end of input')
errorHandler.error('unexpected end of input');
case '>':
switch (s) {
case S_TAG:
el.setTagName(source.slice(start, p));
case S_E:
case S_S:
case S_C:
break;//normal
case S_V://Compatible state
case S_ATTR:
value = source.slice(start, p);
if (value.slice(-1) === '/') {
el.closed = true;
value = value.slice(0, -1)
}
case S_ATTR_S:
if (s === S_ATTR_S) {
value = attrName;
}
if (s == S_V) {
errorHandler.warning('attribute "' + value + '" missed quot(")!!');
el.add(attrName, value.replace(/&#?\w+;/g, entityReplacer), start)
} else {
errorHandler.warning('attribute "' + value + '" missed value!! "' + value + '" instead!!')
el.add(value, value, start)
}
break;
case S_EQ:
throw new Error('attribute value missed!!');
}
// //console.log(tagName,tagNamePattern,tagNamePattern.test(tagName))
return p;
/*xml space '\x20' | #x9 | #xD | #xA; */
case '\u0080':
c = ' ';
default:
if (c <= ' ') {//space
switch (s) {
case S_TAG:
el.setTagName(source.slice(start, p));//tagName
s = S_S;
break;
case S_ATTR:
attrName = source.slice(start, p)
s = S_ATTR_S;
break;
case S_V:
var value = source.slice(start, p).replace(/&#?\w+;/g, entityReplacer);
errorHandler.warning('attribute "' + value + '" missed quot(")!!');
el.add(attrName, value, start)
case S_E:
s = S_S;
break;
//case S_S:
//case S_EQ:
//case S_ATTR_S:
// void();break;
//case S_C:
//ignore warning
}
} else {//not space
//S_TAG, S_ATTR, S_EQ, S_V
//S_ATTR_S, S_E, S_S, S_C
switch (s) {
//case S_TAG:void();break;
//case S_ATTR:void();break;
//case S_V:void();break;
case S_ATTR_S:
errorHandler.warning('attribute "' + attrName + '" missed value!! "' + attrName + '" instead!!')
el.add(attrName, attrName, start);
start = p;
s = S_ATTR;
break;
case S_E:
errorHandler.warning('attribute space is required"' + attrName + '"!!')
case S_S:
s = S_ATTR;
start = p;
break;
case S_EQ:
s = S_V;
start = p;
break;
case S_C:
throw new Error("elements closed character '/' and '>' must be connected to");
}
}
}
p++;
}
}
/**
* @return end of the elementStartPart(end of elementEndPart for selfClosed el)
*/
function appendElement(el, domBuilder, parseStack) {
var tagName = el.tagName;
var localNSMap = null;
var currentNSMap = parseStack[parseStack.length - 1].currentNSMap;
var i = el.length;
while (i--) {
var a = el[i];
var qName = a.qName;
var value = a.value;
var nsp = qName.indexOf(':');
if (nsp > 0) {
var prefix = a.prefix = qName.slice(0, nsp);
var localName = qName.slice(nsp + 1);
var nsPrefix = prefix === 'xmlns' && localName
} else {
localName = qName;
prefix = null
nsPrefix = qName === 'xmlns' && ''
}
//can not set prefix,because prefix !== ''
a.localName = localName;
//prefix == null for no ns prefix attribute
if (nsPrefix !== false) {//hack!!
if (localNSMap == null) {
localNSMap = {}
////console.log(currentNSMap,0)
_copy(currentNSMap, currentNSMap = {})
////console.log(currentNSMap,1)
}
currentNSMap[nsPrefix] = localNSMap[nsPrefix] = value;
a.uri = 'http://www.w3.org/2000/xmlns/'
domBuilder.startPrefixMapping(nsPrefix, value)
}
}
var i = el.length;
while (i--) {
a = el[i];
var prefix = a.prefix;
if (prefix) {//no prefix attribute has no namespace
if (prefix === 'xml') {
a.uri = 'http://www.w3.org/XML/1998/namespace';
}
if (prefix !== 'xmlns') {
a.uri = currentNSMap[prefix]
//{//console.log('###'+a.qName,domBuilder.locator.systemId+'',currentNSMap,a.uri)}
}
}
}
var nsp = tagName.indexOf(':');
if (nsp > 0) {
prefix = el.prefix = tagName.slice(0, nsp);
localName = el.localName = tagName.slice(nsp + 1);
} else {
prefix = null;//important!!
localName = el.localName = tagName;
}
//no prefix element has default namespace
var ns = el.uri = currentNSMap[prefix || ''];
domBuilder.startElement(ns, localName, tagName, el);
//endPrefixMapping and startPrefixMapping have not any help for dom builder
//localNSMap = null
if (el.closed) {
domBuilder.endElement(ns, localName, tagName);
if (localNSMap) {
for (prefix in localNSMap) {
domBuilder.endPrefixMapping(prefix)
}
}
} else {
el.currentNSMap = currentNSMap;
el.localNSMap = localNSMap;
parseStack.push(el);
}
}
function parseHtmlSpecialContent(source, elStartEnd, tagName, entityReplacer, domBuilder) {
if (/^(?:script|textarea)$/i.test(tagName)) {
var elEndStart = source.indexOf('</' + tagName + '>', elStartEnd);
var text = source.substring(elStartEnd + 1, elEndStart);
if (/[&<]/.test(text)) {
if (/^script$/i.test(tagName)) {
//if(!/\]\]>/.test(text)){
//lexHandler.startCDATA();
domBuilder.characters(text, 0, text.length);
//lexHandler.endCDATA();
return elEndStart;
//}
}//}else{//text area
text = text.replace(/&#?\w+;/g, entityReplacer);
domBuilder.characters(text, 0, text.length);
return elEndStart;
//}
}
}
return elStartEnd + 1;
}
function fixSelfClosed(source, elStartEnd, tagName, closeMap) {
//if(tagName in closeMap){
var pos = closeMap[tagName];
if (pos == null) {
////console.log(tagName)
pos = closeMap[tagName] = source.lastIndexOf('</' + tagName + '>')
}
return pos < elStartEnd;
//}
}
function _copy(source, target) {
for (var n in source) {
target[n] = source[n]
}
}
function parseDCC(source, start, domBuilder, errorHandler) {//sure start with '<!'
var next = source.charAt(start + 2)
switch (next) {
case '-':
if (source.charAt(start + 3) === '-') {
var end = source.indexOf('-->', start + 4);
//append comment source.substring(4,end)//<!--
if (end > start) {
domBuilder.comment(source, start + 4, end - start - 4);
return end + 3;
} else {
errorHandler.error("Unclosed comment");
return -1;
}
} else {
//error
return -1;
}
default:
if (source.substr(start + 3, 6) == 'CDATA[') {
var end = source.indexOf(']]>', start + 9);
domBuilder.startCDATA();
domBuilder.characters(source, start + 9, end - start - 9);
domBuilder.endCDATA()
return end + 3;
}
//<!DOCTYPE
//startDTD(java.lang.String name, java.lang.String publicId, java.lang.String systemId)
var matchs = split(source, start);
var len = matchs.length;
if (len > 1 && /!doctype/i.test(matchs[0][0])) {
var name = matchs[1][0];
var pubid = len > 3 && /^public$/i.test(matchs[2][0]) && matchs[3][0]
var sysid = len > 4 && matchs[4][0];
var lastMatch = matchs[len - 1]
domBuilder.startDTD(name, pubid && pubid.replace(/^(['"])(.*?)\1$/, '$2'),
sysid && sysid.replace(/^(['"])(.*?)\1$/, '$2'));
domBuilder.endDTD();
return lastMatch.index + lastMatch[0].length
}
}
return -1;
}
function parseInstruction(source, start, domBuilder) {
var end = source.indexOf('?>', start);
if (end) {
var match = source.substring(start, end).match(/^<\?(\S*)\s*([\s\S]*?)\s*$/);
if (match) {
var len = match[0].length;
domBuilder.processingInstruction(match[1], match[2]);
return end + 2;
} else {//error
return -1;
}
}
return -1;
}
/**
* @param source
*/
function ElementAttributes(source) {
}
ElementAttributes.prototype = {
setTagName: function (tagName) {
if (!tagNamePattern.test(tagName)) {
throw new Error('invalid tagName:' + tagName)
}
this.tagName = tagName
},
add: function (qName, value, offset) {
if (!tagNamePattern.test(qName)) {
throw new Error('invalid attribute:' + qName)
}
this[this.length++] = {qName: qName, value: value, offset: offset}
},
length: 0,
getLocalName: function (i) {
return this[i].localName
},
getOffset: function (i) {
return this[i].offset
},
getQName: function (i) {
return this[i].qName
},
getURI: function (i) {
return this[i].uri
},
getValue: function (i) {
return this[i].value
}
// ,getIndex:function(uri, localName)){
// if(localName){
//
// }else{
// var qName = uri
// }
// },
// getValue:function(){return this.getValue(this.getIndex.apply(this,arguments))},
// getType:function(uri,localName){}
// getType:function(i){},
}
function _set_proto_(thiz, parent) {
thiz.__proto__ = parent;
return thiz;
}
if (!(_set_proto_({}, _set_proto_.prototype) instanceof _set_proto_)) {
_set_proto_ = function (thiz, parent) {
function p() {
};
p.prototype = parent;
p = new p();
for (parent in thiz) {
p[parent] = thiz[parent];
}
return p;
}
}
function split(source, start) {
var match;
var buf = [];
var reg = /'[^']+'|"[^"]+"|[^\s<>\/=]+=?|(\/?\s*>|<)/g;
reg.lastIndex = start;
reg.exec(source);//skip <
while (match = reg.exec(source)) {
buf.push(match);
if (match[1])return buf;
}
}
if (typeof require == 'function') {
exports.XMLReader = XMLReader;
}