// XML 1.0 name productions that the patterns below approximate:
//[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
//[4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
//[5] Name ::= NameStartChar (NameChar)*
// Differences from the productions: ":" is left out of the character class
// (tagNamePattern handles it as the single prefix separator) and the
// supplementary range \u10000-\uEFFFF is not covered.
var nameStartChar = /[A-Z_a-z\xC0-\xD6\xD8-\xF6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/;
// NameChar = NameStartChar plus "-", ".", digits and the combining/extender ranges.
// slice(1,-1) strips the surrounding [ ] from the NameStartChar class so it can be embedded.
var nameChar = new RegExp("[\\-\\.0-9"+nameStartChar.source.slice(1,-1)+"\\u00B7\\u0300-\\u036F\\u203F-\\u2040]");
// A qualified name: Name, optionally followed by exactly one ":" and another Name.
var tagNamePattern = new RegExp('^'+nameStartChar.source+nameChar.source+'*(?::'+nameStartChar.source+nameChar.source+'*)?$');
// ASCII-only alternative kept for reference:
//var tagNamePattern = /^[a-zA-Z_][\w\-\.]*(?:\:[a-zA-Z_][\w\-\.]*)?$/
// SAX handler names kept for reference:
//var handlers = 'resolveEntity,getExternalSubset,characters,endDocument,endElement,endPrefixMapping,ignorableWhitespace,processingInstruction,setDocumentLocator,skippedEntity,startDocument,startElement,startPrefixMapping,notationDecl,unparsedEntityDecl,error,fatalError,warning,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,comment,endCDATA,endDTD,endEntity,startCDATA,startDTD,startEntity'.split(',')

// States of the start-tag tokenizer (parseElementStartPart).
var S_TAG = 0;//reading the tag name
var S_ATTR = 1;//reading an attribute name
var S_ATTR_S=2;//attribute name ended, whitespace seen after it
var S_EQ = 3;//'=' seen, value not started yet (whitespace allowed)
var S_V = 4;//reading an unquoted attribute value
var S_E = 5;//quoted attribute value just ended, no whitespace yet
var S_S = 6;//whitespace after the tag name or after an attribute value
var S_C = 7;//'/' of a self-closing element (<el />) seen
/**
 * SAX-style XML reader.
 * `parse` reads `this.domBuilder` and `this.errorHandler`, so both must be
 * assigned on the instance before it is called.
 */
function XMLReader(){
}
XMLReader.prototype = {
  /**
   * Parses `source`, bracketing the run with startDocument()/endDocument()
   * on the DOM builder.
   * @param source text to parse
   * @param defaultNSMap initial prefix -> namespace URI map; it is copied, the
   *        caller's object is never modified
   * @param entityMap entity name -> replacement text, passed through unchanged
   */
  parse:function(source,defaultNSMap,entityMap){
    var domBuilder = this.domBuilder;
    domBuilder.startDocument();
    // work on a private copy so namespace handling cannot leak into the caller's map
    var nsMapCopy = {};
    _copy(defaultNSMap,nsMapCopy);
    parse(source,nsMapCopy,entityMap,
        domBuilder,this.errorHandler);
    domBuilder.endDocument();
  }
}
/**
 * Main tokenizer loop: walks `source` from '<' to '<', reporting text, end
 * tags, processing instructions, comments/CDATA/doctype and start tags to
 * `domBuilder`.
 *
 * @param source text to parse
 * @param defaultNSMapCopy prefix -> URI map used as the root namespace scope
 * @param entityMap entity name -> replacement text (own properties only)
 * @param domBuilder receiver of the SAX events; its optional `locator`
 *        property, when set, is updated with line/column information
 * @param errorHandler object with warning/error/fatalError methods
 *
 * Exceptions raised while handling one markup construct are reported through
 * errorHandler.error and the offending '<' is then emitted as plain text.
 */
function parse(source,defaultNSMapCopy,entityMap,domBuilder,errorHandler){
  var hasOwn = Object.prototype.hasOwnProperty;
  function fixedFromCharCode(code) {
    // String.fromCharCode cannot build code points above 0xFFFF from a single
    // number, so those are assembled as a UTF-16 surrogate pair.
    if (code > 0xffff) {
      code -= 0x10000;
      var surrogate1 = 0xd800 + (code >> 10)
        , surrogate2 = 0xdc00 + (code & 0x3ff);
      return String.fromCharCode(surrogate1, surrogate2);
    } else {
      return String.fromCharCode(code);
    }
  }
  // Replacement callback for /&#?\w+;/g: named entity, numeric reference, or error.
  function entityReplacer(a){
    var k = a.slice(1,-1);
    // own-property check: `k in entityMap` would also resolve inherited names
    // such as "constructor" or "toString"
    if(hasOwn.call(entityMap,k)){
      return entityMap[k];
    }else if(k.charAt(0) === '#'){
      // "#x41" becomes "0x41" so parseInt reads it as hex; "#65" is read as decimal
      return fixedFromCharCode(parseInt(k.substr(1).replace('x','0x')))
    }else{
      errorHandler.error('entity not found:'+a);
      return a;
    }
  }
  // Emits source[start,end) as character data (entities expanded) and advances `start`.
  function appendText(end){
    if(end>start){
      var xt = source.substring(start,end).replace(/&#?\w+;/g,entityReplacer);
      locator&&position(start);
      // length must describe xt, not the raw slice: entity expansion changes the size
      domBuilder.characters(xt,0,xt.length);
      start = end
    }
  }
  // Moves the locator to source offset p (only called when a locator exists).
  function position(p,m){
    // stop once the last line has been consumed: the pattern would then match
    // the empty string at end of input forever without advancing lastIndex
    while(p>=lineEnd && lineEnd<source.length && (m = linePattern.exec(source))){
      lineStart = m.index;
      lineEnd = lineStart + m[0].length;
      locator.lineNumber++;
    }
    locator.columnNumber = p-lineStart+1;
  }
  var lineStart = 0;
  var lineEnd = 0;
  // `.*` (not `.+`) so that empty lines are counted as lines too
  var linePattern = /.*(?:\r\n?|\n)|.*$/g
  var locator = domBuilder.locator;
  var parseStack = [{currentNSMap:defaultNSMapCopy}]
  var closeMap = {};
  var start = 0;
  var end;
  while(true){
    try{
      var tagStart = source.indexOf('<',start);
      if(tagStart<0){
        // no more markup: keep non-blank trailing text as a text node on the document
        if(!source.substr(start).match(/^\s*$/)){
          var doc = domBuilder.document;
          var text = doc.createTextNode(source.substr(start));
          doc.appendChild(text);
          domBuilder.currentElement = text;
        }
        return;
      }
      if(tagStart>start){
        appendText(tagStart);
      }
      switch(source.charAt(tagStart+1)){
      case '/':// end element
        end = source.indexOf('>',tagStart+3);
        if(end<0){
          // without this guard substring(tagStart+2,-1) yields unrelated text
          // and the element stack gets popped for a tag that never ended
          throw new Error('unclosed end tag');
        }
        var tagName = source.substring(tagStart+2,end);
        var config = parseStack.pop();
        var localNSMap = config.localNSMap;
        if(config.tagName != tagName){
          errorHandler.fatalError("end tag name: "+tagName+' is not match the current start tagName:'+config.tagName );
        }
        domBuilder.endElement(config.uri,config.localName,tagName);
        if(localNSMap){
          for(var prefix in localNSMap){
            domBuilder.endPrefixMapping(prefix) ;
          }
        }
        end++;
        break;
      case '?':// <?...?>
        locator&&position(tagStart);
        end = parseInstruction(source,tagStart,domBuilder);
        break;
      case '!':// <!doctype,<![CDATA,<!--
        locator&&position(tagStart);
        end = parseDCC(source,tagStart,domBuilder,errorHandler);
        break;
      default:// start tag
        locator&&position(tagStart);
        var el = new ElementAttributes();
        end = parseElementStartPart(source,tagStart,el,entityReplacer,errorHandler);
        var len = el.length;
        if(locator){
          if(len){
            // replace each attribute's numeric offset with a line/column snapshot
            for(var i = 0;i<len;i++){
              var a = el[i];
              position(a.offset);
              a.offset = copyLocator(locator,{});
            }
          }
          position(end);
        }
        // an element with no closing tag at or after this point is treated as self-closed
        if(!el.closed && fixSelfClosed(source,end,el.tagName,closeMap)){
          el.closed = true;
          // NOTE(review): presumably `nbsp` marks an HTML entity map, where
          // unclosed elements are expected - confirm against the caller
          if(!entityMap.nbsp){
            errorHandler.warning('unclosed xml attribute');
          }
        }
        appendElement(el,domBuilder,parseStack);
        if(el.uri === 'http://www.w3.org/1999/xhtml' && !el.closed){
          end = parseHtmlSpecialContent(source,end,el.tagName,entityReplacer,domBuilder)
        }else{
          end++;
        }
      }
    }catch(e){
      errorHandler.error('element parse error: '+e);
      end = -1;
    }
    if(end>start){
      start = end;
    }else{
      //TODO: the SAX position may move backwards here, so reported positions can be wrong
      appendText(Math.max(tagStart,start)+1);
    }
  }
}
/**
 * Copies the line/column position from locator `f` onto `t`.
 * @param f source object with lineNumber and columnNumber
 * @param t target object, modified in place
 * @return t
 */
function copyLocator(f,t){
  t.lineNumber = f.lineNumber;
  t.columnNumber = f.columnNumber;
  return t;
}
/**
 * Tokenizes one start tag beginning at source[start] ('<') and records the
 * tag name and attributes on `el`.
 *
 * The scan is a character-driven state machine over the S_* states; several
 * `case` labels fall through on purpose and are marked as such.
 * Lenient input (unquoted values, value-less attributes, missing whitespace)
 * is accepted with errorHandler.warning; structurally impossible input throws.
 *
 * @param source text being parsed
 * @param start index of the '<' that opens the tag
 * @param el receiver with setTagName(name), add(qName,value,offset) and a
 *        `closed` flag that is set for self-closing tags
 * @param entityReplacer replacement callback applied to entity references in values
 * @param errorHandler object with warning/error methods
 * @return index of the '>' ending the tag, or source.length when the input
 *         ends inside the tag
 * @throws Error on misplaced '=', quotes or '/', and whatever el.setTagName /
 *         el.add throw
 */
function parseElementStartPart(source,start,el,entityReplacer,errorHandler){
  var attrName;
  var value;
  var p = ++start;
  var s = S_TAG;//status
  while(true){
    var c = source.charAt(p);
    switch(c){
    case '=':
      if(s === S_ATTR){//attrName
        attrName = source.slice(start,p);
        s = S_EQ;
      }else if(s === S_ATTR_S){
        s = S_EQ;
      }else{
        //fatalError: equal must after attrName or space after attrName
        throw new Error('attribute equal must after attrName');
      }
      break;
    case '\'':
    case '"':
      if(s === S_EQ){//equal
        // quoted value: jump straight to the matching quote
        start = p+1;
        p = source.indexOf(c,start)
        if(p>0){
          value = source.slice(start,p).replace(/&#?\w+;/g,entityReplacer);
          el.add(attrName,value,start-1);
          s = S_E;
        }else{
          //fatalError: no end quot match
          throw new Error('attribute value no end \''+c+'\' match');
        }
      }else if(s == S_V){
        // a quote ending a value that never had an opening quote
        value = source.slice(start,p).replace(/&#?\w+;/g,entityReplacer);
        el.add(attrName,value,start);
        errorHandler.warning('attribute "'+attrName+'" missed start quot('+c+')!!');
        start = p+1;
        s = S_E
      }else{
        //fatalError: no equal before
        throw new Error('attribute value must after "="');
      }
      break;
    case '/':
      switch(s){
      case S_TAG:
        el.setTagName(source.slice(start,p));
        // falls through
      case S_E:
      case S_S:
      case S_C:
        s = S_C;
        el.closed = true;
        // falls through
      case S_V:
      case S_ATTR:
      case S_ATTR_S:
        // inside a name or an unquoted value the '/' is ordinary content
        break;
      //case S_EQ:
      default:
        throw new Error("attribute invalid close char('/')")
      }
      break;
    case ''://end document
      errorHandler.error('unexpected end of input');
      // falls through: finish the tag as if '>' had been seen
    case '>':
      switch(s){
      case S_TAG:
        el.setTagName(source.slice(start,p));
        // falls through
      case S_E:
      case S_S:
      case S_C:
        break;//normal
      case S_V://Compatible state
      case S_ATTR:
        value = source.slice(start,p);
        if(value.slice(-1) === '/'){
          // trailing '/' belongs to the self-closing marker, not to the token
          el.closed = true;
          value = value.slice(0,-1)
        }
        // falls through
      case S_ATTR_S:
        if(s === S_ATTR_S){
          value = attrName;
        }
        if(s == S_V){
          errorHandler.warning('attribute "'+value+'" missed quot(")!!');
          el.add(attrName,value.replace(/&#?\w+;/g,entityReplacer),start)
        }else{
          // value-less attribute: the name doubles as the value
          errorHandler.warning('attribute "'+value+'" missed value!! "'+value+'" instead!!')
          el.add(value,value,start)
        }
        break;
      case S_EQ:
        throw new Error('attribute value missed!!');
      }
      return p;
    /*xml space '\x20' | #x9 | #xD | #xA; */
    case '\u0080':
      c = ' ';
      // falls through: U+0080 is handled like whitespace
    default:
      if(c<= ' '){//space
        switch(s){
        case S_TAG:
          el.setTagName(source.slice(start,p));//tagName
          s = S_S;
          break;
        case S_ATTR:
          attrName = source.slice(start,p)
          s = S_ATTR_S;
          break;
        case S_V:
          value = source.slice(start,p).replace(/&#?\w+;/g,entityReplacer);
          errorHandler.warning('attribute "'+value+'" missed quot(")!!');
          el.add(attrName,value,start)
          // falls through
        case S_E:
          s = S_S;
          break;
        // S_S, S_EQ, S_ATTR_S and S_C: additional whitespace is ignored
        }
      }else{//not space
        switch(s){
        // S_TAG, S_ATTR and S_V: the character extends the current token
        case S_ATTR_S:
          // a new name starts although the previous attribute got no value
          errorHandler.warning('attribute "'+attrName+'" missed value!! "'+attrName+'" instead!!')
          el.add(attrName,attrName,start);
          start = p;
          s = S_ATTR;
          break;
        case S_E:
          errorHandler.warning('attribute space is required"'+attrName+'"!!')
          // falls through
        case S_S:
          s = S_ATTR;
          start = p;
          break;
        case S_EQ:
          s = S_V;
          start = p;
          break;
        case S_C:
          throw new Error("elements closed character '/' and '>' must be connected to");
        }
      }
    }
    p++;
  }
}
/**
 * Resolves namespaces for a tokenized start tag and reports it to the builder.
 *
 * Steps: (1) split every attribute name into prefix/localName and collect
 * xmlns declarations into a new namespace scope, (2) assign namespace URIs to
 * prefixed attributes, (3) resolve the element's own namespace and call
 * startElement. Self-closed elements are ended immediately; open ones are
 * pushed onto `parseStack` together with their namespace scope.
 *
 * @param el element descriptor with tagName, closed, length and indexed
 *        attributes ({qName,value,...}); uri/prefix/localName are written to it
 * @param domBuilder receiver of startPrefixMapping/startElement/endElement/endPrefixMapping
 * @param parseStack stack of open elements; the top entry supplies currentNSMap
 * Returns nothing.
 */
function appendElement(el,domBuilder,parseStack){
  var tagName = el.tagName;
  var localNSMap = null;
  var currentNSMap = parseStack[parseStack.length-1].currentNSMap;
  var a, prefix, localName, nsPrefix, nsp;
  var i = el.length;
  while(i--){
    a = el[i];
    var qName = a.qName;
    var value = a.value;
    nsp = qName.indexOf(':');
    if(nsp>0){
      prefix = a.prefix = qName.slice(0,nsp);
      localName = qName.slice(nsp+1);
      // "xmlns:foo" declares prefix "foo"
      nsPrefix = prefix === 'xmlns' && localName
    }else{
      localName = qName;
      prefix = null
      // plain "xmlns" declares the default namespace, represented by ''
      nsPrefix = qName === 'xmlns' && ''
    }
    //can not set prefix,because prefix !== ''
    a.localName = localName ;
    //prefix == null for no ns prefix attribute
    // '' is a valid declaration (default namespace), so compare against false explicitly
    if(nsPrefix !== false){
      if(localNSMap == null){
        localNSMap = {}
        // first declaration on this element: fork the inherited scope so the
        // parent's map stays untouched
        var inheritedNSMap = currentNSMap;
        currentNSMap = {};
        _copy(inheritedNSMap,currentNSMap)
      }
      currentNSMap[nsPrefix] = localNSMap[nsPrefix] = value;
      a.uri = 'http://www.w3.org/2000/xmlns/'
      domBuilder.startPrefixMapping(nsPrefix, value)
    }
  }
  i = el.length;
  while(i--){
    a = el[i];
    prefix = a.prefix;
    if(prefix){//no prefix attribute has no namespace
      if(prefix === 'xml'){
        // the xml prefix is always bound to this URI, whatever the map contains
        a.uri = 'http://www.w3.org/XML/1998/namespace';
      }else if(prefix !== 'xmlns'){
        a.uri = currentNSMap[prefix]
      }
    }
  }
  nsp = tagName.indexOf(':');
  if(nsp>0){
    prefix = el.prefix = tagName.slice(0,nsp);
    localName = el.localName = tagName.slice(nsp+1);
  }else{
    prefix = null;//important!! (must not keep the last attribute's prefix)
    localName = el.localName = tagName;
  }
  //no prefix element has default namespace
  var ns = el.uri = currentNSMap[prefix || ''];
  domBuilder.startElement(ns,localName,tagName,el);
  //endPrefixMapping and startPrefixMapping have not any help for dom builder
  if(el.closed){
    domBuilder.endElement(ns,localName,tagName);
    if(localNSMap){
      for(prefix in localNSMap){
        domBuilder.endPrefixMapping(prefix)
      }
    }
  }else{
    // remember the scope so the matching end tag can close the prefix mappings
    el.currentNSMap = currentNSMap;
    el.localNSMap = localNSMap;
    parseStack.push(el);
  }
}
/**
 * Handles the raw-text content of XHTML <script> and <textarea> elements.
 * When the content up to the matching closing tag contains '&' or '<', it is
 * reported as one characters() event: verbatim for script, with entity
 * references expanded for textarea.
 *
 * @param source text being parsed
 * @param elStartEnd index of the '>' that ended the start tag
 * @param tagName tag name as written in the source
 * @param entityReplacer replacement callback for entity references
 * @param domBuilder receiver of characters()
 * @return index of the closing tag's '<' when the content was consumed here,
 *         otherwise elStartEnd+1 (content is left to the normal parser)
 */
function parseHtmlSpecialContent(source,elStartEnd,tagName,entityReplacer,domBuilder){
  if(/^(?:script|textarea)$/i.test(tagName)){
    var elEndStart = source.indexOf('</'+tagName+'>',elStartEnd);
    if(elEndStart<0){
      // no closing tag: substring(elStartEnd+1,-1) would return text from
      // before the element, so fall back to normal parsing instead
      return elStartEnd+1;
    }
    var text = source.substring(elStartEnd+1,elEndStart);
    if(/[&<]/.test(text)){
      if(/^script$/i.test(tagName)){
        // script content is passed through untouched
        domBuilder.characters(text,0,text.length);
        return elEndStart;
      }
      // textarea: entity references are still expanded
      text = text.replace(/&#?\w+;/g,entityReplacer);
      domBuilder.characters(text,0,text.length);
      return elEndStart;
    }
  }
  return elStartEnd+1;
}
/**
 * Tells whether an element must be treated as self-closed because no
 * `</tagName>` occurs at or after its start tag.
 *
 * @param source text being parsed
 * @param elStartEnd index of the '>' that ended the start tag
 * @param tagName tag name as written in the source
 * @param closeMap per-parse cache: tagName -> index of the last `</tagName>`
 *        (-1 when there is none); filled in by this function
 * @return true when the last closing tag lies before elStartEnd (or is absent)
 */
function fixSelfClosed(source,elStartEnd,tagName,closeMap){
  var pos;
  // own-property lookup: tag names such as "constructor" must not pick up
  // members inherited from Object.prototype
  if(Object.prototype.hasOwnProperty.call(closeMap,tagName)){
    pos = closeMap[tagName];
  }
  if(pos == null){
    pos = closeMap[tagName] = source.lastIndexOf('</'+tagName+'>')
  }
  return pos<elStartEnd;
}
/**
 * Shallow-copies every enumerable property of `source` (inherited ones
 * included, as visited by for-in) onto `target`.
 * A null or undefined `source` copies nothing.
 */
function _copy(source,target){
  for(var n in source){
    target[n] = source[n]
  }
}
/**
 * Parses a construct starting with '<!': comment, CDATA section or DOCTYPE.
 *
 * @param source text being parsed
 * @param start index of the '<' (the caller guarantees source[start+1] is '!')
 * @param domBuilder receiver of comment / startCDATA,characters,endCDATA /
 *        startDTD,endDTD
 * @param errorHandler object with an error method
 * @return index just past the construct, or -1 when it could not be parsed
 */
function parseDCC(source,start,domBuilder,errorHandler){//sure start with '<!'
  var next= source.charAt(start+2)
  var end;
  switch(next){
  case '-':
    if(source.charAt(start + 3) === '-'){
      end = source.indexOf('-->',start+4);
      //comment text is source.substring(start+4,end)
      if(end>start){
        domBuilder.comment(source,start+4,end-start-4);
        return end+3;
      }else{
        errorHandler.error("Unclosed comment");
        return -1;
      }
    }else{
      //"<!-" not followed by a second '-'
      return -1;
    }
  default:
    if(source.substr(start+3,6) == 'CDATA['){
      end = source.indexOf(']]>',start+9);
      if(end<0){
        // without the terminator the length below would be negative and the
        // returned position would point back to the start of the source
        errorHandler.error("Unclosed CDATA");
        return -1;
      }
      domBuilder.startCDATA();
      domBuilder.characters(source,start+9,end-start-9);
      domBuilder.endCDATA()
      return end+3;
    }
    //<!DOCTYPE
    //startDTD(java.lang.String name, java.lang.String publicId, java.lang.String systemId)
    var matchs = split(source,start);
    // split() yields nothing when the declaration is never terminated
    var len = matchs ? matchs.length : 0;
    if(len>1 && /!doctype/i.test(matchs[0][0])){
      var name = matchs[1][0];
      // the final token is always the '>' / '<' terminator, never an identifier
      var last = len-1;
      var pubid = last>3 && /^public$/i.test(matchs[2][0]) && matchs[3][0]
      // SYSTEM "sysid" carries the system id right after the keyword;
      // otherwise it is the token following the public id
      var sysid = (last>3 && /^system$/i.test(matchs[2][0]))
        ? matchs[3][0]
        : (last>4 && matchs[4][0]);
      var lastMatch = matchs[last]
      // strip the surrounding quotes from the identifiers
      domBuilder.startDTD(name,pubid && pubid.replace(/^(['"])(.*?)\1$/,'$2'),
          sysid && sysid.replace(/^(['"])(.*?)\1$/,'$2'));
      domBuilder.endDTD();
      return lastMatch.index+lastMatch[0].length
    }
  }
  return -1;
}
/**
 * Parses a processing instruction `<?target data?>` that starts at `start`
 * and reports it through domBuilder.processingInstruction(target, data).
 *
 * @param source text being parsed
 * @param start index of the '<' that opens the instruction
 * @param domBuilder receiver of processingInstruction
 * @return index just past the closing '?>', or -1 when there is no '?>' after
 *         `start` or the text does not have the `<?...` shape
 */
function parseInstruction(source,start,domBuilder){
  var end = source.indexOf('?>',start);
  // indexOf reports "not found" as -1, which is truthy, so test it explicitly
  if(end>=0){
    var match = source.substring(start,end).match(/^<\?(\S*)\s*([\s\S]*?)\s*$/);
    if(match){
      domBuilder.processingInstruction(match[1], match[2]) ;
      return end+2;
    }
  }
  return -1;
}
/**
 * Array-like collection describing one start tag: `tagName` plus attributes
 * stored under the indices 0..length-1 as {qName, value, offset} records.
 * @param source unused
 */
function ElementAttributes(source){
}
ElementAttributes.prototype = {
  /**
   * Stores the element name.
   * @throws Error when the name does not match tagNamePattern
   */
  setTagName:function(tagName){
    if(!tagNamePattern.test(tagName)){
      throw new Error('invalid tagName:'+tagName)
    }
    this.tagName = tagName
  },
  /**
   * Appends an attribute record.
   * @param qName qualified attribute name
   * @param value attribute value
   * @param offset position information for the attribute
   * @throws Error when qName does not match tagNamePattern
   */
  add:function(qName,value,offset){
    if(!tagNamePattern.test(qName)){
      throw new Error('invalid attribute:'+qName)
    }
    // the first add() shadows the prototype's length with an own property
    this[this.length++] = {qName:qName,value:value,offset:offset}
  },
  length:0,
  // Index-based accessors; localName and uri are expected to be filled in on
  // the records by other code before they are read.
  getLocalName:function(i){return this[i].localName},
  getOffset:function(i){return this[i].offset},
  getQName:function(i){return this[i].qName},
  getURI:function(i){return this[i].uri},
  getValue:function(i){return this[i].value}
}
/**
 * Makes `parent` the prototype of `thiz`.
 * @param thiz object whose prototype is to be set
 * @param parent object to inherit from
 * @return the object that inherits from `parent`: `thiz` itself here, a new
 *         object in the fallback below
 */
function _set_proto_(thiz,parent){
  thiz.__proto__ = parent;
  return thiz;
}
// Feature test: where assigning __proto__ does not change the prototype chain,
// switch to a constructor-based copy instead.
if(!(_set_proto_({},_set_proto_.prototype) instanceof _set_proto_)){
  _set_proto_ = function(thiz,parent){
    function Surrogate(){}
    Surrogate.prototype = parent;
    var result = new Surrogate();
    // copy the enumerable members of thiz onto the new object
    for(var key in thiz){
      result[key] = thiz[key];
    }
    return result;
  }
}
/**
 * Tokenizes a declaration such as `<!DOCTYPE ...>` starting at `start`.
 * Tokens are quoted strings, runs of characters other than whitespace < > / =
 * (with an optional trailing '='), or a terminator ('>' possibly preceded by
 * '/', or '<').
 *
 * @param source text being parsed
 * @param start index of the opening '<', which is skipped
 * @return array of RegExp match objects ending with the terminator match, or
 *         undefined when the source ends before a terminator is found
 */
function split(source,start){
  var match;
  var buf = [];
  var reg = /'[^']+'|"[^"]+"|[^\s<>\/=]+=?|(\/?\s*>|<)/g;
  reg.lastIndex = start;
  reg.exec(source);//skip <
  while(match = reg.exec(source)){
    buf.push(match);
    // capture group 1 is only set for the terminator alternative
    if(match[1]){
      return buf;
    }
  }
}
// CommonJS export. Both globals are checked: loaders that provide `require`
// without `exports` would otherwise hit a ReferenceError on the assignment.
if(typeof require === 'function' && typeof exports !== 'undefined'){
  exports.XMLReader = XMLReader;
}