| //[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] |
| //[4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] |
| //[5] Name ::= NameStartChar (NameChar)* |
// Character class for the first character of a name. The ":" alternative of
// production [4] is deliberately left out (tagNamePattern handles the single
// prefix separator itself), and the [#x10000-#xEFFFF] range is not covered.
var nameStartChar = /[A-Z_a-z\xC0-\xD6\xD8-\xF6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/;

// Character class for every following character of a name: everything in
// nameStartChar (its source with the surrounding brackets stripped) plus
// "-", ".", digits, U+00B7 and the two combining/extender ranges.
var nameChar = new RegExp(
	'[\\-\\.0-9' +
	nameStartChar.source.slice(1, -1) +
	'\\u00B7\\u0300-\\u036F\\u203F-\\u2040]'
);

// Whole-string matcher for a tag or attribute name: one name, optionally
// followed by a single ":" and a second name (prefix:localName).
var tagNamePattern = (function () {
	var namePart = nameStartChar.source + nameChar.source + '*';
	return new RegExp('^' + namePart + '(?::' + namePart + ')?$');
})();
| //var tagNamePattern = /^[a-zA-Z_][\w\-\.]*(?:\:[a-zA-Z_][\w\-\.]*)?$/ |
| //var handlers = 'resolveEntity,getExternalSubset,characters,endDocument,endElement,endPrefixMapping,ignorableWhitespace,processingInstruction,setDocumentLocator,skippedEntity,startDocument,startElement,startPrefixMapping,notationDecl,unparsedEntityDecl,error,fatalError,warning,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,comment,endCDATA,endDTD,endEntity,startCDATA,startDTD,startEntity'.split(',') |
| |
/*
 * States of the start-tag scanner (parseElementStartPart).
 */
// reading the tag name
var S_TAG = 0;
// reading an attribute name
var S_ATTR = 1;
// attribute name finished, whitespace seen after it
var S_ATTR_SPACE = 2;
// "=" seen, value not started yet
var S_EQ = 3;
// reading an attribute value that has no opening quote
var S_ATTR_NOQUOT_VALUE = 4;
// a quoted attribute value just ended, no whitespace seen yet
var S_ATTR_END = 5;
// whitespace seen after the tag name or after an attribute value
var S_TAG_SPACE = 6;
// "/" of a self-closing element (<el />) seen
var S_TAG_CLOSE = 7;
| |
/**
 * SAX-style reader. The caller is expected to assign `domBuilder` and
 * `errorHandler` on the instance before calling `parse`; both are read
 * from `this` there.
 */
function XMLReader() {}

XMLReader.prototype = {
	/**
	 * Parses `source`, bracketing the scan with startDocument/endDocument
	 * notifications on the DOM builder.
	 *
	 * @param source         text to parse
	 * @param defaultNSMap   initial prefix -> namespace map; it is copied, so
	 *                       the caller's object is never handed to the scanner
	 * @param entityMap      entity table, passed through unchanged
	 */
	parse: function (source, defaultNSMap, entityMap) {
		var builder = this.domBuilder;
		var nsMapCopy = {};
		builder.startDocument();
		_copy(defaultNSMap, nsMapCopy);
		parse(source, nsMapCopy, entityMap, builder, this.errorHandler);
		builder.endDocument();
	}
};
/**
 * Scans `source` from start to end and reports what it finds to
 * `domBuilder` (text, end tags, and whatever the per-construct helpers
 * report for start tags, processing instructions, comments, CDATA and
 * doctype declarations).
 *
 * The scanner is deliberately forgiving: a construct that fails to parse is
 * reported through `errorHandler.error` and the offending "<" is emitted as
 * text so that scanning can continue.
 *
 * @param source            text to parse
 * @param defaultNSMapCopy  prefix -> namespace map used for the root level
 * @param entityMap         entity name -> replacement text; only the map's
 *                          OWN properties are consulted
 * @param domBuilder        receiver of the parse events; `domBuilder.locator`
 *                          (optional) is updated with line/column positions
 * @param errorHandler      object with `warning`, `error` and `fatalError`
 */
function parse(source,defaultNSMapCopy,entityMap,domBuilder,errorHandler){
	/**
	 * Converts a code point to a string. String.fromCharCode only handles
	 * UTF-16 code units, so values above U+FFFF are split into a surrogate
	 * pair first.
	 */
	function fixedFromCharCode(code) {
		if (code > 0xffff) {
			code -= 0x10000;
			var surrogate1 = 0xd800 + (code >> 10);
			var surrogate2 = 0xdc00 + (code & 0x3ff);
			return String.fromCharCode(surrogate1, surrogate2);
		}
		return String.fromCharCode(code);
	}

	/**
	 * Replacement callback for /&#?\w+;/g matches.
	 *
	 * Named entities are looked up as own properties of entityMap, so that
	 * "&constructor;" or "&toString;" do not resolve to members inherited
	 * from Object.prototype. Numeric references must be well-formed
	 * ("&#" digits ";" or "&#x" hex digits ";"); anything else is reported
	 * and left untouched instead of being turned into U+0000.
	 */
	function entityReplacer(a){
		var k = a.slice(1,-1);
		if(Object.prototype.hasOwnProperty.call(entityMap,k)){
			return entityMap[k];
		}
		if(/^#x[0-9a-fA-F]+$/.test(k)){
			return fixedFromCharCode(parseInt(k.slice(2),16));
		}
		if(/^#[0-9]+$/.test(k)){
			return fixedFromCharCode(parseInt(k.slice(1),10));
		}
		errorHandler.error('entity not found:'+a);
		return a;
	}

	/**
	 * Emits source[start, end) as character data (entities resolved) and
	 * advances `start` to `end`. Does nothing when the range is empty.
	 */
	function appendText(end){//has some bugs
		if(end>start){
			var xt = source.substring(start,end).replace(/&#?\w+;/g,entityReplacer);
			if(locator){
				position(start);
			}
			domBuilder.characters(xt,0,end-start);
			start = end;
		}
	}

	/**
	 * Advances the locator to offset `p`: consumes lines from linePattern
	 * until the current line contains `p`, then sets the 1-based column.
	 */
	function position(p,m){
		while(p>=lineEnd && (m = linePattern.exec(source))){
			lineStart = m.index;
			lineEnd = lineStart + m[0].length;
			locator.lineNumber++;
		}
		locator.columnNumber = p-lineStart+1;
	}

	var lineStart = 0;
	var lineEnd = 0;
	var linePattern = /.*(?:\r\n?|\n)|.*$/g;
	var locator = domBuilder.locator;

	// Stack of open elements; the bottom entry only carries the root namespace map.
	var parseStack = [{currentNSMap:defaultNSMapCopy}];
	// Cache handed to fixSelfClosed.
	var closeMap = {};
	var start = 0;
	var end;
	var tagStart;
	while(true){
		try{
			tagStart = source.indexOf('<',start);
			if(tagStart<0){
				// No more markup: a non-blank tail becomes a text node appended to the document.
				if(!source.substr(start).match(/^\s*$/)){
					var doc = domBuilder.doc;
					var text = doc.createTextNode(source.substr(start));
					doc.appendChild(text);
					domBuilder.currentElement = text;
				}
				return;
			}
			if(tagStart>start){
				appendText(tagStart);
			}
			switch(source.charAt(tagStart+1)){
			case '/':// end tag
				end = source.indexOf('>',tagStart+3);
				var tagName = source.substring(tagStart+2,end);
				var config = parseStack.pop();
				if(end<0){
					// No ">" at all: take the name up to the first space or "<".
					tagName = source.substring(tagStart+2).replace(/[\s<].*/,'');
					errorHandler.error("end tag name: "+tagName+' is not complete:'+config.tagName);
					end = tagStart+1+tagName.length;
				}else if(tagName.match(/\s</)){
					// The ">" found belongs to a later tag.
					tagName = tagName.replace(/[\s<].*/,'');
					errorHandler.error("end tag name: "+tagName+' maybe not complete');
					end = tagStart+1+tagName.length;
				}
				var localNSMap = config.localNSMap;
				var endMatch = config.tagName == tagName;
				var endIgnoreCaseMach = endMatch || config.tagName&&config.tagName.toLowerCase() == tagName.toLowerCase();
				if(endIgnoreCaseMach){
					domBuilder.endElement(config.uri,config.localName,tagName);
					if(localNSMap){
						for(var prefix in localNSMap){
							domBuilder.endPrefixMapping(prefix);
						}
					}
					if(!endMatch){
						errorHandler.fatalError("end tag name: "+tagName+' is not match the current start tagName:'+config.tagName );
					}
				}else{
					// Not the element on top of the stack: leave the stack as it was.
					parseStack.push(config);
				}

				end++;
				break;
			case '?':// <?...?>
				if(locator){
					position(tagStart);
				}
				end = parseInstruction(source,tagStart,domBuilder);
				break;
			case '!':// <!doctype,<![CDATA,<!--
				if(locator){
					position(tagStart);
				}
				end = parseDCC(source,tagStart,domBuilder,errorHandler);
				break;
			default:// start tag
				if(locator){
					position(tagStart);
				}
				var el = new ElementAttributes();
				var currentNSMap = parseStack[parseStack.length-1].currentNSMap;
				end = parseElementStartPart(source,tagStart,el,currentNSMap,entityReplacer,errorHandler);
				var len = el.length;

				if(!el.closed && fixSelfClosed(source,end,el.tagName,closeMap)){
					el.closed = true;
					// NOTE(review): `nbsp` presumably marks an HTML entity table,
					// where unclosed elements are tolerated silently - confirm.
					if(!entityMap.nbsp){
						errorHandler.warning('unclosed xml attribute');
					}
				}
				if(locator && len){
					// Give each attribute its own locator, then report the element
					// with the position it had before the attributes were walked.
					var locator2 = copyLocator(locator,{});
					for(var i = 0;i<len;i++){
						var a = el[i];
						position(a.offset);
						a.locator = copyLocator(locator,{});
					}
					domBuilder.locator = locator2;
					if(appendElement(el,domBuilder,currentNSMap)){
						parseStack.push(el);
					}
					domBuilder.locator = locator;
				}else{
					if(appendElement(el,domBuilder,currentNSMap)){
						parseStack.push(el);
					}
				}

				if(el.uri === 'http://www.w3.org/1999/xhtml' && !el.closed){
					end = parseHtmlSpecialContent(source,end,el.tagName,entityReplacer,domBuilder);
				}else{
					end++;
				}
			}
		}catch(e){
			errorHandler.error('element parse error: '+e);
			end = -1;
		}
		if(end>start){
			start = end;
		}else{
			// TODO: the SAX position may move backwards here, so reported
			// positions can be wrong. Emit the "<" as text and carry on.
			appendText(Math.max(tagStart,start)+1);
		}
	}
}
/**
 * Copies the line and column of locator `f` onto `t`.
 *
 * @return `t`, to allow `copyLocator(locator, {})`
 */
function copyLocator(f,t){
	var fields = ['lineNumber', 'columnNumber'];
	for (var n = 0; n < fields.length; n++) {
		t[fields[n]] = f[fields[n]];
	}
	return t;
}
| |
/**
 * Scans one start tag, beginning at the "<" at offset `start`, and records
 * the tag name and attributes on `el` (via `el.setTagName` / `el.add`, and
 * `el.closed = true` for a self-closing tag).
 *
 * The scanner is a character-driven state machine over the S_* states. It
 * tolerates several malformed inputs with a warning (unquoted values,
 * attributes without a value, missing whitespace between attributes) and
 * throws an Error for the ones it cannot recover from.
 *
 * @param source          full source text
 * @param start           offset of the "<" that opens the tag
 * @param el              attribute collector for this element
 * @param currentNSMap    namespace map in scope; only its default ('') entry
 *                        is read, to silence the missing-value warning for
 *                        disabled/checked/selected under the XHTML namespace
 * @param entityReplacer  replacement callback applied to /&#?\w+;/g matches
 *                        in attribute values
 * @param errorHandler    object with `warning` and `error`
 * @return offset of the closing ">" of the start tag, or of the end of the
 *         source when the input ends inside the tag
 */
function parseElementStartPart(source,start,el,currentNSMap,entityReplacer,errorHandler){
	// Resolves entity references in a raw attribute value.
	function decode(raw){
		return raw.replace(/&#?\w+;/g,entityReplacer);
	}
	// Warns about a value-less attribute unless it is one of the tolerated
	// boolean attributes and the default namespace is XHTML.
	function warnMissingValue(name,suffix){
		if(currentNSMap[''] !== 'http://www.w3.org/1999/xhtml' || !name.match(/^(?:disabled|checked|selected)$/i)){
			errorHandler.warning('attribute "'+name+'" missed value!! "'+name+'" '+suffix);
		}
	}

	var attrName;
	var attrValue;
	var pos = ++start;
	var state = S_TAG;
	while(true){
		var ch = source.charAt(pos);
		switch(ch){
		case '=':
			if(state === S_ATTR){
				attrName = source.slice(start,pos);
				state = S_EQ;
			}else if(state === S_ATTR_SPACE){
				state = S_EQ;
			}else{
				// "=" is only legal right after an attribute name (and optional space)
				throw new Error('attribute equal must after attrName');
			}
			break;
		case '\'':
		case '"':
			if(state === S_EQ || state === S_ATTR){
				if(state === S_ATTR){
					// name immediately followed by a quote: treat as name="..."
					errorHandler.warning('attribute value must after "="');
					attrName = source.slice(start,pos);
				}
				start = pos+1;
				pos = source.indexOf(ch,start);
				if(pos>0){
					attrValue = decode(source.slice(start,pos));
					el.add(attrName,attrValue,start-1);
					state = S_ATTR_END;
				}else{
					throw new Error('attribute value no end \''+ch+'\' match');
				}
			}else if(state == S_ATTR_NOQUOT_VALUE){
				// value started without a quote but ends with one
				attrValue = decode(source.slice(start,pos));
				el.add(attrName,attrValue,start);
				errorHandler.warning('attribute "'+attrName+'" missed start quot('+ch+')!!');
				start = pos+1;
				state = S_ATTR_END;
			}else{
				throw new Error('attribute value must after "="');
			}
			break;
		case '/':
			if(state === S_TAG){
				el.setTagName(source.slice(start,pos));
			}
			if(state === S_TAG || state === S_ATTR_END || state === S_TAG_SPACE || state === S_TAG_CLOSE){
				state = S_TAG_CLOSE;
				el.closed = true;
			}else if(state !== S_ATTR_NOQUOT_VALUE && state !== S_ATTR && state !== S_ATTR_SPACE){
				// i.e. directly after "="
				throw new Error("attribute invalid close char('/')");
			}
			// inside an attribute name / unquoted value the "/" is just kept
			break;
		case ''://end of input
			errorHandler.error('unexpected end of input');
			if(state == S_TAG){
				el.setTagName(source.slice(start,pos));
			}
			return pos;
		case '>':
			if(state === S_TAG){
				el.setTagName(source.slice(start,pos));
			}else if(state === S_EQ){
				throw new Error('attribute value missed!!');
			}else if(state === S_ATTR_NOQUOT_VALUE || state === S_ATTR || state === S_ATTR_SPACE){
				if(state === S_ATTR_SPACE){
					attrValue = attrName;
				}else{
					attrValue = source.slice(start,pos);
					if(attrValue.slice(-1) === '/'){
						// "<a b/>" or "<a b=c/>": the slash closes the element
						el.closed = true;
						attrValue = attrValue.slice(0,-1);
					}
				}
				if(state == S_ATTR_NOQUOT_VALUE){
					errorHandler.warning('attribute "'+attrValue+'" missed quot(")!!');
					el.add(attrName,decode(attrValue),start);
				}else{
					// attribute without a value: its name doubles as its value
					warnMissingValue(attrValue,'instead!!');
					el.add(attrValue,attrValue,start);
				}
			}
			// S_ATTR_END, S_TAG_SPACE and S_TAG_CLOSE need no extra work
			return pos;
		case '\u0080':
			// handled like whitespace
			ch = ' ';
			// falls through
		default:
			if(ch<= ' '){//whitespace (and control characters)
				if(state === S_TAG){
					el.setTagName(source.slice(start,pos));
					state = S_TAG_SPACE;
				}else if(state === S_ATTR){
					attrName = source.slice(start,pos);
					state = S_ATTR_SPACE;
				}else if(state === S_ATTR_NOQUOT_VALUE){
					attrValue = decode(source.slice(start,pos));
					errorHandler.warning('attribute "'+attrValue+'" missed quot(")!!');
					el.add(attrName,attrValue,start);
					state = S_TAG_SPACE;
				}else if(state === S_ATTR_END){
					state = S_TAG_SPACE;
				}
				// every other state simply skips the whitespace
			}else{//any other character
				if(state === S_ATTR_SPACE){
					// previous attribute had no value; a new attribute name starts here
					warnMissingValue(attrName,'instead2!!');
					el.add(attrName,attrName,start);
					start = pos;
					state = S_ATTR;
				}else if(state === S_ATTR_END || state === S_TAG_SPACE){
					if(state === S_ATTR_END){
						errorHandler.warning('attribute space is required"'+attrName+'"!!');
					}
					state = S_ATTR;
					start = pos;
				}else if(state === S_EQ){
					state = S_ATTR_NOQUOT_VALUE;
					start = pos;
				}else if(state === S_TAG_CLOSE){
					throw new Error("elements closed character '/' and '>' must be connected to");
				}
				// S_TAG, S_ATTR and S_ATTR_NOQUOT_VALUE keep accumulating
			}
		}
		pos++;
	}
}
/**
 * Resolves the namespaces of a scanned start tag and reports the element to
 * the DOM builder.
 *
 * Steps, in order:
 *  1. every attribute gets its `localName` (and `prefix` when it has one);
 *     xmlns / xmlns:* attributes extend a private copy of `currentNSMap` and
 *     are announced through `startPrefixMapping`;
 *  2. prefixed attributes get their `uri` (the `xml` prefix is always bound
 *     to the XML namespace; `xmlns:*` keeps the xmlns namespace from step 1);
 *  3. the element's own prefix/localName/uri are set and `startElement` is
 *     called;
 *  4. a closed element is ended immediately (followed by `endPrefixMapping`
 *     for each mapping it declared).
 *
 * @param el            scanned element (array-like list of attributes plus
 *                      `tagName` and `closed`)
 * @param domBuilder    receiver of the SAX events
 * @param currentNSMap  prefix -> namespace map in scope; never mutated
 * @return true when the element is left open - `el.currentNSMap` and
 *         `el.localNSMap` are then set and the caller must keep `el` on its
 *         stack; undefined when the element was closed here
 */
function appendElement(el,domBuilder,currentNSMap){
	var tagName = el.tagName;
	var localNSMap = null;
	var a;
	var nsp;
	var prefix;
	var localName;
	var nsPrefix;

	var i = el.length;
	while(i--){
		a = el[i];
		var qName = a.qName;
		var value = a.value;
		nsp = qName.indexOf(':');
		if(nsp>0){
			prefix = a.prefix = qName.slice(0,nsp);
			localName = qName.slice(nsp+1);
			// false unless this is an xmlns:* declaration
			nsPrefix = prefix === 'xmlns' && localName;
		}else{
			localName = qName;
			prefix = null;
			// '' for the default-namespace declaration, false otherwise
			nsPrefix = qName === 'xmlns' && '';
		}
		// a.prefix is only set when there is a prefix (it is never '')
		a.localName = localName;
		if(nsPrefix !== false){// '' is a valid (default) prefix, so compare against false
			if(localNSMap == null){
				localNSMap = {};
				// copy-on-first-declaration: keep the caller's map untouched
				var inheritedNSMap = currentNSMap;
				currentNSMap = {};
				_copy(inheritedNSMap,currentNSMap);
			}
			currentNSMap[nsPrefix] = localNSMap[nsPrefix] = value;
			a.uri = 'http://www.w3.org/2000/xmlns/';
			domBuilder.startPrefixMapping(nsPrefix, value);
		}
	}

	i = el.length;
	while(i--){
		a = el[i];
		prefix = a.prefix;
		if(prefix){// an attribute without a prefix has no namespace
			if(prefix === 'xml'){
				// bound by definition; must not be overwritten by a map lookup
				a.uri = 'http://www.w3.org/XML/1998/namespace';
			}else if(prefix !== 'xmlns'){
				a.uri = currentNSMap[prefix];
			}
		}
	}

	nsp = tagName.indexOf(':');
	if(nsp>0){
		prefix = el.prefix = tagName.slice(0,nsp);
		localName = el.localName = tagName.slice(nsp+1);
	}else{
		prefix = null;// must be reset: it still holds the last attribute's prefix
		localName = el.localName = tagName;
	}
	// an element without a prefix is in the default namespace
	var ns = el.uri = currentNSMap[prefix || ''];
	domBuilder.startElement(ns,localName,tagName,el);
	if(el.closed){
		domBuilder.endElement(ns,localName,tagName);
		if(localNSMap){
			for(prefix in localNSMap){
				domBuilder.endPrefixMapping(prefix);
			}
		}
	}else{
		el.currentNSMap = currentNSMap;
		el.localNSMap = localNSMap;
		return true;
	}
}
/**
 * Handles the raw-text content of an HTML <script> or <textarea> element.
 *
 * When the content (everything up to the matching "</tagName>") contains
 * "&" or "<", it is emitted as one `characters` event so that the scanner
 * does not try to interpret it as markup: verbatim for script, with entity
 * references resolved for textarea.
 *
 * @param source          full source text
 * @param elStartEnd      offset of the ">" that ends the start tag
 * @param tagName         tag name as written in the source
 * @param entityReplacer  replacement callback for /&#?\w+;/g matches
 * @param domBuilder      receiver of the `characters` event
 * @return offset at which scanning continues: the offset of the "</" of the
 *         end tag when the content was emitted here, otherwise
 *         `elStartEnd + 1` (other elements, content without "&"/"<", or no
 *         end tag found)
 */
function parseHtmlSpecialContent(source,elStartEnd,tagName,entityReplacer,domBuilder){
	if(/^(?:script|textarea)$/i.test(tagName)){
		var elEndStart = source.indexOf('</'+tagName+'>',elStartEnd);
		if(elEndStart<0){
			// No end tag: substring() would swap its arguments and hand back
			// the text in front of the element. Fall back to normal scanning.
			return elStartEnd+1;
		}
		var text = source.substring(elStartEnd+1,elEndStart);
		if(/[&<]/.test(text)){
			if(!/^script$/i.test(tagName)){
				// textarea: entity references are still resolved
				text = text.replace(/&#?\w+;/g,entityReplacer);
			}
			domBuilder.characters(text,0,text.length);
			return elEndStart;
		}
	}
	return elStartEnd+1;
}
/**
 * Tells whether an element that was not written as self-closing has no end
 * tag after its start tag, i.e. should be treated as closed anyway.
 *
 * The offset of the last end tag for each tag name is cached in `closeMap`.
 * The cache is read with an own-property check so that tag names such as
 * "constructor" or "toString" do not pick up members of Object.prototype.
 *
 * @param source      full source text
 * @param elStartEnd  offset of the end of the start tag
 * @param tagName     tag name as written in the source
 * @param closeMap    cache object: tag name -> offset of its last end tag
 * @return true when the last end tag for `tagName` lies before `elStartEnd`
 *         (or there is none)
 */
function fixSelfClosed(source,elStartEnd,tagName,closeMap){
	var pos = Object.prototype.hasOwnProperty.call(closeMap,tagName) ? closeMap[tagName] : null;
	if(pos == null){
		pos = source.lastIndexOf('</'+tagName+'>');
		if(pos<elStartEnd){
			// the end tag may have been left unterminated ("</name" without ">")
			pos = source.lastIndexOf('</'+tagName);
		}
		closeMap[tagName] = pos;
	}
	return pos<elStartEnd;
}
/**
 * Shallow-copies every enumerable property of `source` (inherited ones
 * included, as enumerated by for...in) onto `target`.
 */
function _copy(source,target){
	var key;
	for(key in source){
		target[key] = source[key];
	}
}
/**
 * Parses a construct that starts with "<!" at offset `start`: a comment, a
 * CDATA section or a doctype declaration.
 *
 * @param source        full source text
 * @param start         offset of the "<" of "<!"
 * @param domBuilder    receiver of comment / startCDATA / characters /
 *                      endCDATA / startDTD / endDTD
 * @param errorHandler  object with `error`
 * @return offset just past the construct, or -1 when it could not be parsed
 *         (unterminated comment, CDATA section or declaration, or anything
 *         that is none of the three)
 */
function parseDCC(source,start,domBuilder,errorHandler){//sure start with '<!'
	var end;
	if(source.charAt(start+2) === '-'){
		if(source.charAt(start + 3) !== '-'){
			// "<!-" that is not "<!--"
			return -1;
		}
		end = source.indexOf('-->',start+4);
		if(end>start){
			domBuilder.comment(source,start+4,end-start-4);
			return end+3;
		}
		errorHandler.error("Unclosed comment");
		return -1;
	}
	if(source.substr(start+3,6) == 'CDATA['){
		end = source.indexOf(']]>',start+9);
		if(end<0){
			// Without this guard a negative length would be reported and
			// the returned offset (2) would point into unrelated text.
			errorHandler.error("Unclosed CDATA");
			return -1;
		}
		domBuilder.startCDATA();
		domBuilder.characters(source,start+9,end-start-9);
		domBuilder.endCDATA();
		return end+3;
	}
	//<!DOCTYPE name [PUBLIC pubid] [sysid] ... >
	var matchs = split(source,start);
	if(!matchs){
		// split() yields nothing when the declaration is never terminated
		errorHandler.error("Unclosed markup declaration");
		return -1;
	}
	var len = matchs.length;
	if(len>1 && /!doctype/i.test(matchs[0][0])){
		var name = matchs[1][0];
		var pubid = len>3 && /^public$/i.test(matchs[2][0]) && matchs[3][0];
		var sysid = len>4 && matchs[4][0];
		var lastMatch = matchs[len-1];
		// strip the surrounding quotes from the public and system identifiers
		domBuilder.startDTD(name,pubid && pubid.replace(/^(['"])(.*?)\1$/,'$2'),
				sysid && sysid.replace(/^(['"])(.*?)\1$/,'$2'));
		domBuilder.endDTD();

		return lastMatch.index+lastMatch[0].length;
	}
	return -1;
}
| |
| |
| |
/**
 * Parses a processing instruction ("<?target data?>") that starts at offset
 * `start` and reports it through `domBuilder.processingInstruction`.
 *
 * @param source      full source text
 * @param start       offset of the "<" of "<?"
 * @param domBuilder  receiver of the processingInstruction(target, data) call
 * @return offset just past the closing "?>", or -1 when the instruction is
 *         not terminated or does not have the expected shape
 */
function parseInstruction(source,start,domBuilder){
	var end = source.indexOf('?>',start);
	if(end<0){
		// Unterminated. (indexOf's -1 is truthy, so a plain `if(end)` would
		// go on to slice the text in FRONT of `start`.)
		return -1;
	}
	var match = source.substring(start,end).match(/^<\?(\S*)\s*([\s\S]*?)\s*$/);
	if(!match){
		return -1;
	}
	domBuilder.processingInstruction(match[1], match[2]);
	return end+2;
}
| |
/**
 * Array-like collector for one start tag: holds the tag name and, at the
 * indices 0..length-1, one record per attribute.
 *
 * @param source  unused
 */
function ElementAttributes(source){}

ElementAttributes.prototype = {
	/**
	 * Stores the tag name.
	 * @throws Error when the name does not match tagNamePattern
	 */
	setTagName:function(tagName){
		if(tagNamePattern.test(tagName)){
			this.tagName = tagName;
			return;
		}
		throw new Error('invalid tagName:'+tagName);
	},
	/**
	 * Appends an attribute record {qName, value, offset}.
	 * @throws Error when the qualified name does not match tagNamePattern
	 */
	add:function(qName,value,offset){
		if(tagNamePattern.test(qName)){
			var index = this.length;
			this.length = index + 1;
			this[index] = {qName:qName,value:value,offset:offset};
			return;
		}
		throw new Error('invalid attribute:'+qName);
	},
	// number of attributes; shadowed by an own property on the first add()
	length:0,
	// accessors for the fields of the attribute record at `index`
	getLocalName:function(index){
		var attr = this[index];
		return attr.localName;
	},
	getLocator:function(index){
		var attr = this[index];
		return attr.locator;
	},
	getQName:function(index){
		var attr = this[index];
		return attr.qName;
	},
	getURI:function(index){
		var attr = this[index];
		return attr.uri;
	},
	getValue:function(index){
		var attr = this[index];
		return attr.value;
	}
};
| |
| |
| |
| |
/**
 * Gives `thiz` the prototype `parent` by assigning `__proto__`.
 *
 * @return `thiz` itself here; the fallback installed below returns a NEW
 *         object instead
 */
function _set_proto_(thiz,parent){
	thiz.__proto__ = parent;
	return thiz;
}

// Feature test: when assigning __proto__ has no effect in this engine,
// replace the function with one that builds a fresh object inheriting from
// `parent` and copies the enumerable properties of `thiz` onto it.
if(!(_set_proto_({},_set_proto_.prototype) instanceof _set_proto_)){
	_set_proto_ = function(thiz,parent){
		function Surrogate(){}
		Surrogate.prototype = parent;
		var copy = new Surrogate();
		for(var key in thiz){
			copy[key] = thiz[key];
		}
		return copy;
	};
}
| |
/**
 * Tokenizes a markup declaration that starts at offset `start`.
 *
 * The first token found (the opening "<") is skipped. After that, quoted
 * strings, bare words (with an optional trailing "=") and tag delimiters
 * are collected until the first delimiter (">", "/>" or "<") is seen.
 *
 * @param source  full source text
 * @param start   offset at which tokenizing begins
 * @return array of RegExp match objects, the last one being the delimiter;
 *         undefined when the source ends before a delimiter is found
 */
function split(source,start){
	var tokenPattern = /'[^']+'|"[^"]+"|[^\s<>\/=]+=?|(\/?\s*>|<)/g;
	var tokens = [];
	var found;
	tokenPattern.lastIndex = start;
	tokenPattern.exec(source);//skip <
	for(found = tokenPattern.exec(source); found; found = tokenPattern.exec(source)){
		tokens.push(found);
		if(found[1]){
			return tokens;
		}
	}
}
| |
| exports.XMLReader = XMLReader; |
| |