// XML 1.0 name productions that the patterns below implement:
//[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
//[4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
//[5] Name ::= NameStartChar (NameChar)*

// Character class for the first character of a name. ":" is deliberately left
// out (tagNamePattern handles the prefix separator itself) and the
// [#x10000-#xEFFFF] range of the grammar is not part of this class.
var nameStartChar = /[A-Z_a-z\xC0-\xD6\xD8-\xF6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/; //\u10000-\uEFFFF

// Character class for every following character: the start class (its source
// with the surrounding brackets sliced off) plus "-", ".", digits and the
// extra ranges of production [4a].
var nameChar = new RegExp("[\\-\\.0-9" + nameStartChar.source.slice(1, -1) + "\u00B7\u0300-\u036F\\u203F-\u2040]");

// Whole-string match for `name` or `prefix:name`.
var tagNamePattern = new RegExp('^' + nameStartChar.source + nameChar.source + '*(?:\:' + nameStartChar.source + nameChar.source + '*)?$');
//var tagNamePattern = /^[a-zA-Z_][\w\-\.]*(?:\:[a-zA-Z_][\w\-\.]*)?$/
//var handlers = 'resolveEntity,getExternalSubset,characters,endDocument,endElement,endPrefixMapping,ignorableWhitespace,processingInstruction,setDocumentLocator,skippedEntity,startDocument,startElement,startPrefixMapping,notationDecl,unparsedEntityDecl,error,fatalError,warning,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,comment,endCDATA,endDTD,endEntity,startCDATA,startDTD,startEntity'.split(',')

// States of the start-tag scanner (parseElementStartPart); S_V, S_E, S_S and
// S_C are declared right after these.
//S_TAG, S_ATTR, S_EQ, S_V
//S_ATTR_S, S_E, S_S, S_C
var S_TAG = 0;//tag name offerring
var S_ATTR = 1;//attr name offerring
var S_ATTR_S = 2;//attr name end and space offer
var S_EQ = 3;//=space?
var S_V = 4;//attr value(no quot value only)
var S_E = 5;//attr value end and no space(quot end)
var S_S = 6;//(attr value end || tag end ) && (space offer)
var S_C = 7;//closed el

/**
 * SAX-style reader. `parse` reads `this.domBuilder` and `this.errorHandler`,
 * so both have to be assigned on the instance before parsing.
 */
function XMLReader() {
}

XMLReader.prototype = {
  /**
   * Tokenizes `source` and reports the events to `this.domBuilder`,
   * bracketed by startDocument()/endDocument().
   *
   * @param source        text to parse
   * @param defaultNSMap  prefix -> namespace URI map; it is copied, never mutated
   * @param entityMap     entity name -> replacement text
   */
  parse: function (source, defaultNSMap, entityMap) {
    var domBuilder = this.domBuilder;
    domBuilder.startDocument();
    // work on a private copy so namespace declarations found while parsing
    // never leak into the caller's map
    var nsMapCopy = {};
    _copy(defaultNSMap, nsMapCopy);
    parse(source, nsMapCopy, entityMap, domBuilder, this.errorHandler);
    domBuilder.endDocument();
  }
}

/**
 * Main tokenizer loop: walks `source` from '<' to '<', emitting text through
 * domBuilder.characters() and dispatching on the character after '<'.
 *
 * @param source            text to parse
 * @param defaultNSMapCopy  namespace map used for the bottom of the parse stack
 * @param entityMap         entity name -> replacement text
 * @param domBuilder        receiver of the SAX-style events
 * @param errorHandler      object with warning/error/fatalError methods
 */
function parse(source, defaultNSMapCopy, entityMap, domBuilder, errorHandler) {
  /** String.fromCharCode that also handles code points above 0xFFFF. */
  function fixedFromCharCode(code) {
    // String.fromCharCode does not support code points that need more than
    // one UTF-16 unit, so those are split into a surrogate pair by hand
    if (code > 0xffff) {
      code -= 0x10000;
      var surrogate1 = 0xd800 + (code >> 10);
      var surrogate2 = 0xdc00 + (code & 0x3ff);
      return String.fromCharCode(surrogate1, surrogate2);
    }
    return String.fromCharCode(code);
  }

  /**
   * Replacement callback for /&#?\w+;/g: resolves `&name;` through entityMap
   * and `&#N;` / `&#xN;` numerically; unknown entities are reported and left
   * untouched.
   */
  function entityReplacer(a) {
    var k = a.slice(1, -1);
    // own-property check: a plain `in` would also match members inherited
    // from Object.prototype (e.g. "&constructor;")
    if (Object.prototype.hasOwnProperty.call(entityMap, k)) {
      return entityMap[k];
    } else if (k.charAt(0) === '#') {
      // "#x41" -> "0x41" so that parseInt auto-detects hexadecimal;
      // "#65" stays decimal
      return fixedFromCharCode(parseInt(k.substr(1).replace('x', '0x')))
    } else {
      errorHandler.error('entity not found:' + a);
      return a;
    }
  }

  /** Emits source[start, end) as character data and advances `start`. */
  function appendText(end) {//has some bugs
    if (end > start) {
      var xt = source.substring(start, end).replace(/&#?\w+;/g, entityReplacer);
      locator && position(start);
      // the length describes the text that is handed over, which can differ
      // from the raw span once entities have been expanded
      domBuilder.characters(xt, 0, xt.length);
      start = end
    }
  }

  /**
   * Moves the line bookkeeping forward until the line containing offset `p`
   * has been reached and stores the 1-based column in the locator.
   * `m` is only used as a local.
   */
  function position(p, m) {
    while (p >= lineEnd && (m = linePattern.exec(source))) {
      lineStart = m.index;
      lineEnd = lineStart + m[0].length;
      locator.lineNumber++;
    }
    locator.columnNumber = p - lineStart + 1;
  }

  var lineStart = 0;
  var lineEnd = 0;
  var linePattern = /.+(?:\r\n?|\n)|.*$/g
  var locator = domBuilder.locator;
  var parseStack = [{currentNSMap: defaultNSMapCopy}]
  var closeMap = {};
  var start = 0;
  while (true) {
    try {
      var tagStart = source.indexOf('<', start);
      if (tagStart < 0) {
        // no further markup: anything but trailing whitespace becomes a text
        // node appended directly to the document
        if (!source.substr(start).match(/^\s*$/)) {
          var doc = domBuilder.document;
          var text = doc.createTextNode(source.substr(start));
          doc.appendChild(text);
          domBuilder.currentElement = text;
        }
        return;
      }
      if (tagStart > start) {
        appendText(tagStart);
      }
      switch (source.charAt(tagStart + 1)) {
        case '/':
          // end element
          var end = source.indexOf('>', tagStart + 3);
          var tagName = source.substring(tagStart + 2, end);
          var config = parseStack.pop();
          var localNSMap = config.localNSMap;
          if (config.tagName != tagName) {
            errorHandler.fatalError("end tag name: " + tagName + ' is not match the current start tagName:' + config.tagName);
          }
          domBuilder.endElement(config.uri, config.localName, tagName);
          if (localNSMap) {
            for (var prefix in localNSMap) {
              domBuilder.endPrefixMapping(prefix);
            }
          }
          end++;
          break;
        case '?':
          // processing instruction
          locator && position(tagStart);
          end = parseInstruction(source, tagStart, domBuilder);
          break;
        // NOTE(review): the source text between the "case '!'" label and the
        // "if (end > start)" check further down was lost (everything from
        // "<!" up to the next ">" had been stripped from the file). The
        // statements from here to the end of the catch block are a
        // reconstruction based on the signatures of parseDCC,
        // parseElementStartPart, fixSelfClosed, appendElement,
        // parseHtmlSpecialContent and copyLocator in this file, plus an
        // ElementAttributes constructor that is assumed to be defined
        // elsewhere. Verify against a pristine copy of the file.
        case '!':
          // doctype, CDATA section or comment
          locator && position(tagStart);
          end = parseDCC(source, tagStart, domBuilder, errorHandler);
          break;
        default:
          locator && position(tagStart);
          var el = new ElementAttributes();
          //elStartEnd
          var end = parseElementStartPart(source, tagStart, el, entityReplacer, errorHandler);
          var len = el.length;
          if (locator) {
            if (len) {
              // replace every attribute's raw offset by a line/column snapshot
              for (var i = 0; i < len; i++) {
                var a = el[i];
                position(a.offset);
                a.offset = copyLocator(locator, {});
              }
            }
            position(end);
          }
          if (!el.closed && fixSelfClosed(source, end, el.tagName, closeMap)) {
            el.closed = true;
            if (!entityMap.nbsp) {
              errorHandler.warning('unclosed xml attribute');
            }
          }
          appendElement(el, domBuilder, parseStack);
          if (el.uri === 'http://www.w3.org/1999/xhtml' && !el.closed) {
            end = parseHtmlSpecialContent(source, end, el.tagName, entityReplacer, domBuilder)
          } else {
            end++;
          }
      }
    } catch (e) {
      errorHandler.error('element parse error: ' + e)
      end = -1;
    }
    // (end of the reconstructed section)
    if (end > start) {
      start = end;
    } else {
      //TODO: the scan may step backwards here, so reported positions can be wrong
      appendText(Math.max(tagStart, start) + 1);
    }
  }
}

/**
 * Copies lineNumber and columnNumber from `f` to `t`.
 * @return t
 */
function copyLocator(f, t) {
  t.lineNumber = f.lineNumber;
  t.columnNumber = f.columnNumber;
  return t;
}

/**
 * Scans one start tag, beginning at the '<' at `start`, and feeds the tag
 * name and attributes into `el` (el.setTagName, el.add, el.closed).
 * Recoverable problems go to errorHandler.warning/error, unrecoverable ones
 * are thrown as Error.
 *
 * @see #appendElement(source,elStartEnd,el,selfClosed,entityReplacer,domBuilder,parseStack);
 * @return index of the '>' that ends the tag (or the source length when the
 *         input ends inside the tag)
 */
function parseElementStartPart(source, start, el, entityReplacer, errorHandler) {
  var attrName;
  var value;
  var p = ++start;
  var s = S_TAG;//status
  while (true) {
    var c = source.charAt(p);
    switch (c) {
      case '=':
        if (s === S_ATTR) {//attrName
          attrName = source.slice(start, p);
          s = S_EQ;
        } else if (s === S_ATTR_S) {
          s = S_EQ;
        } else {
          //fatalError: equal must after attrName or space after attrName
          throw new Error('attribute equal must after attrName');
        }
        break;
      case '\'':
      case '"':
        if (s === S_EQ) {//equal
          start = p + 1;
          p = source.indexOf(c, start)
          if (p > 0) {
            value = source.slice(start, p).replace(/&#?\w+;/g, entityReplacer);
            el.add(attrName, value, start - 1);
            s = S_E;
          } else {
            //fatalError: no end quot match
            throw new Error('attribute value no end \'' + c + '\' match');
          }
        } else if (s == S_V) {
          // closing quote of a value whose opening quote was missing
          value = source.slice(start, p).replace(/&#?\w+;/g, entityReplacer);
          el.add(attrName, value, start);
          errorHandler.warning('attribute "' + attrName + '" missed start quot(' + c + ')!!');
          start = p + 1;
          s = S_E
        } else {
          //fatalError: no equal before
          throw new Error('attribute value must after "="');
        }
        break;
      case '/':
        switch (s) {
          case S_TAG:
            el.setTagName(source.slice(start, p));
            // falls through
          case S_E:
          case S_S:
          case S_C:
            s = S_C;
            el.closed = true;
            // falls through
          case S_V:
          case S_ATTR:
          case S_ATTR_S:
            break;
          //case S_EQ:
          default:
            throw new Error("attribute invalid close char('/')")
        }
        break;
      case ''://end document
        //throw new Error('unexpected end of input')
        errorHandler.error('unexpected end of input');
        // falls through: finish the tag as if '>' had been seen
      case '>':
        switch (s) {
          case S_TAG:
            el.setTagName(source.slice(start, p));
            // falls through
          case S_E:
          case S_S:
          case S_C:
            break;//normal
          case S_V://Compatible state
          case S_ATTR:
            value = source.slice(start, p);
            if (value.slice(-1) === '/') {
              el.closed = true;
              value = value.slice(0, -1)
            }
            // falls through
          case S_ATTR_S:
            if (s === S_ATTR_S) {
              value = attrName;
            }
            if (s == S_V) {
              errorHandler.warning('attribute "' + value + '" missed quot(")!!');
              el.add(attrName, value.replace(/&#?\w+;/g, entityReplacer), start)
            } else {
              errorHandler.warning('attribute "' + value + '" missed value!! "' + value + '" instead!!')
              el.add(value, value, start)
            }
            break;
          case S_EQ:
            throw new Error('attribute value missed!!');
        }
        return p;
      /*xml space '\x20' | #x9 | #xD | #xA; */
      case '\u0080':
        c = ' ';
        // falls through
      default:
        if (c <= ' ') {//space
          switch (s) {
            case S_TAG:
              el.setTagName(source.slice(start, p));//tagName
              s = S_S;
              break;
            case S_ATTR:
              attrName = source.slice(start, p)
              s = S_ATTR_S;
              break;
            case S_V:
              value = source.slice(start, p).replace(/&#?\w+;/g, entityReplacer);
              errorHandler.warning('attribute "' + value + '" missed quot(")!!');
              el.add(attrName, value, start)
              // falls through
            case S_E:
              s = S_S;
              break;
            //S_S, S_EQ, S_ATTR_S and S_C: nothing to do
          }
        } else {//not space
          //S_TAG, S_ATTR, S_EQ, S_V
          //S_ATTR_S, S_E, S_S, S_C
          switch (s) {
            //S_TAG, S_ATTR and S_V: keep consuming the current token
            case S_ATTR_S:
              errorHandler.warning('attribute "' + attrName + '" missed value!! "' + attrName + '" instead!!')
              el.add(attrName, attrName, start);
              start = p;
              s = S_ATTR;
              break;
            case S_E:
              errorHandler.warning('attribute space is required"' + attrName + '"!!')
              // falls through
            case S_S:
              s = S_ATTR;
              start = p;
              break;
            case S_EQ:
              s = S_V;
              start = p;
              break;
            case S_C:
              throw new Error("elements closed character '/' and '>' must be connected to");
          }
        }
    }
    p++;
  }
}

/**
 * Resolves namespaces for a scanned start tag and reports it: xmlns
 * declarations are turned into startPrefixMapping() calls, every attribute
 * and the element itself get localName/prefix/uri, then startElement() is
 * called. A closed element is ended right away; an open one is pushed onto
 * parseStack together with its namespace maps. Nothing is returned.
 */
function appendElement(el, domBuilder, parseStack) {
  var tagName = el.tagName;
  var localNSMap = null;
  var currentNSMap = parseStack[parseStack.length - 1].currentNSMap;
  var a, prefix, localName, nsPrefix, nsp;
  var i = el.length;
  // pass 1: split qualified names and collect namespace declarations
  while (i--) {
    a = el[i];
    var qName = a.qName;
    var value = a.value;
    nsp = qName.indexOf(':');
    if (nsp > 0) {
      prefix = a.prefix = qName.slice(0, nsp);
      localName = qName.slice(nsp + 1);
      nsPrefix = prefix === 'xmlns' && localName
    } else {
      localName = qName;
      prefix = null
      nsPrefix = qName === 'xmlns' && ''
    }
    //can not set prefix,because prefix !== ''
    a.localName = localName;
    //prefix == null for no ns prefix attribute
    // nsPrefix is `false` unless the attribute is a namespace declaration;
    // '' (the default namespace) has to count as a hit, hence the strict test
    if (nsPrefix !== false) {//hack!!
      if (localNSMap == null) {
        localNSMap = {}
        // copy on first write so the parent's map stays untouched
        var inheritedNSMap = currentNSMap;
        currentNSMap = {};
        _copy(inheritedNSMap, currentNSMap);
      }
      currentNSMap[nsPrefix] = localNSMap[nsPrefix] = value;
      a.uri = 'http://www.w3.org/2000/xmlns/'
      domBuilder.startPrefixMapping(nsPrefix, value)
    }
  }
  // pass 2: now that all declarations of this tag are known, resolve the
  // namespace of prefixed attributes
  i = el.length;
  while (i--) {
    a = el[i];
    prefix = a.prefix;
    if (prefix) {//no prefix attribute has no namespace
      if (prefix === 'xml') {
        // the xml prefix is bound to this namespace by definition, so it
        // must not be overwritten by a (possibly missing) map entry
        a.uri = 'http://www.w3.org/XML/1998/namespace';
      } else if (prefix !== 'xmlns') {
        a.uri = currentNSMap[prefix]
      }
    }
  }
  nsp = tagName.indexOf(':');
  if (nsp > 0) {
    prefix = el.prefix = tagName.slice(0, nsp);
    localName = el.localName = tagName.slice(nsp + 1);
  } else {
    prefix = null;//important!!
    localName = el.localName = tagName;
  }
  //no prefix element has default namespace
  var ns = el.uri = currentNSMap[prefix || ''];
  domBuilder.startElement(ns, localName, tagName, el);
  //endPrefixMapping and startPrefixMapping have not any help for dom builder
  if (el.closed) {
    domBuilder.endElement(ns, localName, tagName);
    if (localNSMap) {
      for (prefix in localNSMap) {
        domBuilder.endPrefixMapping(prefix)
      }
    }
  } else {
    el.currentNSMap = currentNSMap;
    el.localNSMap = localNSMap;
    parseStack.push(el);
  }
}

/**
 * Special handling for the raw-text content of script and textarea
 * elements: when the content contains '&' or '<' it is emitted as a single
 * characters() event instead of being tokenized (entities are expanded for
 * textarea only).
 *
 * @param elStartEnd index of the '>' that ends the start tag
 * @return index of the closing tag when the content was consumed here,
 *         otherwise elStartEnd + 1
 */
function parseHtmlSpecialContent(source, elStartEnd, tagName, entityReplacer, domBuilder) {
  if (/^(?:script|textarea)$/i.test(tagName)) {
    var elEndStart = source.indexOf('</' + tagName + '>', elStartEnd);
    if (elEndStart < 0) {
      // no closing tag: there is no content span to hand over, continue
      // with regular tokenizing
      return elStartEnd + 1;
    }
    var text = source.substring(elStartEnd + 1, elEndStart);
    if (/[&<]/.test(text)) {
      if (/^script$/i.test(tagName)) {
        // script content is passed through verbatim
        domBuilder.characters(text, 0, text.length);
        return elEndStart;
      }
      // textarea
      text = text.replace(/&#?\w+;/g, entityReplacer);
      domBuilder.characters(text, 0, text.length);
      return elEndStart;
    }
  }
  return elStartEnd + 1;
}

/**
 * Tells whether a start tag that ends at `elStartEnd` has no closing tag
 * after it and should therefore be treated as self-closed. The position of
 * the last closing tag per tag name is cached in `closeMap`.
 */
function fixSelfClosed(source, elStartEnd, tagName, closeMap) {
  var pos = closeMap[tagName];
  if (pos == null) {
    pos = closeMap[tagName] = source.lastIndexOf('</' + tagName + '>')
  }
  return pos < elStartEnd;
}

/** Copies every enumerable property of `source` onto `target`. */
function _copy(source, target) {
  for (var n in source) {
    target[n] = source[n]
  }
}

// NOTE(review): the definition of parseDCC is cut off at this point and part
// of its text is missing, so it cannot be restored from what is visible here.
// The surviving text is kept verbatim in the line comment below (it was
// already behind an earlier "//" on the same physical line of the file).
// function parseDCC(source, start, domBuilder, errorHandler) {//sure start with '', start + 4); //append comment source.substring(4,end)//