| module.exports = Tokenizer; |
| |
| var decodeCodePoint = require("entities/lib/decode_codepoint.js"); |
| var entityMap = require("entities/maps/entities.json"); |
| var legacyMap = require("entities/maps/legacy.json"); |
| var xmlMap = require("entities/maps/xml.json"); |
| |
//Tokenizer states. Every state receives the next consecutive integer from
//the counter `i`, starting at 0. Where a trailing comment is present it names
//the character that was consumed to reach that state.
var i = 0;

//text and tags
var TEXT = i++;
var BEFORE_TAG_NAME = i++; //<
var IN_TAG_NAME = i++;
var IN_SELF_CLOSING_TAG = i++;
var BEFORE_CLOSING_TAG_NAME = i++;
var IN_CLOSING_TAG_NAME = i++;
var AFTER_CLOSING_TAG_NAME = i++;

//attributes
var BEFORE_ATTRIBUTE_NAME = i++;
var IN_ATTRIBUTE_NAME = i++;
var AFTER_ATTRIBUTE_NAME = i++;
var BEFORE_ATTRIBUTE_VALUE = i++;
var IN_ATTRIBUTE_VALUE_DQ = i++; //"
var IN_ATTRIBUTE_VALUE_SQ = i++; //'
var IN_ATTRIBUTE_VALUE_NQ = i++;

//declarations
var BEFORE_DECLARATION = i++; //!
var IN_DECLARATION = i++;

//processing instructions
var IN_PROCESSING_INSTRUCTION = i++; //?

//comments
var BEFORE_COMMENT = i++;
var IN_COMMENT = i++;
var AFTER_COMMENT_1 = i++;
var AFTER_COMMENT_2 = i++;

//cdata (`<![CDATA[ ... ]]>`)
var BEFORE_CDATA_1 = i++; //[
var BEFORE_CDATA_2 = i++; //C
var BEFORE_CDATA_3 = i++; //D
var BEFORE_CDATA_4 = i++; //A
var BEFORE_CDATA_5 = i++; //T
var BEFORE_CDATA_6 = i++; //A
var IN_CDATA = i++; //[
var AFTER_CDATA_1 = i++; //]
var AFTER_CDATA_2 = i++; //]

//special tags (script & style)
var BEFORE_SPECIAL = i++; //S
var BEFORE_SPECIAL_END = i++; //S

var BEFORE_SCRIPT_1 = i++; //C
var BEFORE_SCRIPT_2 = i++; //R
var BEFORE_SCRIPT_3 = i++; //I
var BEFORE_SCRIPT_4 = i++; //P
var BEFORE_SCRIPT_5 = i++; //T
var AFTER_SCRIPT_1 = i++; //C
var AFTER_SCRIPT_2 = i++; //R
var AFTER_SCRIPT_3 = i++; //I
var AFTER_SCRIPT_4 = i++; //P
var AFTER_SCRIPT_5 = i++; //T

var BEFORE_STYLE_1 = i++; //T
var BEFORE_STYLE_2 = i++; //Y
var BEFORE_STYLE_3 = i++; //L
var BEFORE_STYLE_4 = i++; //E
var AFTER_STYLE_1 = i++; //T
var AFTER_STYLE_2 = i++; //Y
var AFTER_STYLE_3 = i++; //L
var AFTER_STYLE_4 = i++; //E

//entities
var BEFORE_ENTITY = i++; //&
var BEFORE_NUMERIC_ENTITY = i++; //#
var IN_NAMED_ENTITY = i++;
var IN_NUMERIC_ENTITY = i++;
var IN_HEX_ENTITY = i++; //X
| |
//Values stored in `_special`: which special tag (if any) the tokenizer is
//currently inside. Numbered consecutively from 0 via the counter `j`.
var j = 0;

var SPECIAL_NONE = j++;
var SPECIAL_SCRIPT = j++;
var SPECIAL_STYLE = j++;
| |
/**
 * Tells whether `c` is one of the whitespace characters the tokenizer
 * recognises: space, line feed, tab, form feed or carriage return.
 *
 * @param {string} c Character to test.
 * @returns {boolean} `true` for one of the five characters above.
 */
function whitespace(c) {
    switch (c) {
        case " ":
        case "\n":
        case "\t":
        case "\f":
        case "\r":
            return true;
        default:
            return false;
    }
}
| |
/**
 * Builds a state handler that expects one specific character.
 *
 * The returned function must be invoked with a tokenizer as `this`. When the
 * character matches `upper` (or its lower-case form) the tokenizer moves to
 * `SUCCESS`; otherwise it moves to `FAILURE` and `_index` is decremented so
 * the same character is handed to the new state.
 *
 * @param {string} upper Expected character (upper-case form if it has one).
 * @param {number} SUCCESS State to enter on a match.
 * @param {number} FAILURE State to enter on a mismatch.
 * @returns {function(string): void} State handler.
 */
function ifElseState(upper, SUCCESS, FAILURE) {
    var lower = upper.toLowerCase();

    //characters without a distinct lower-case form need a single comparison
    var isExpected =
        upper === lower
            ? function(ch) {
                  return ch === lower;
              }
            : function(ch) {
                  return ch === lower || ch === upper;
              };

    return function(c) {
        if (isExpected(c)) {
            this._state = SUCCESS;
            return;
        }

        this._state = FAILURE;
        this._index--;
    };
}
| |
/**
 * Builds a state handler for one letter of a special tag name.
 *
 * The returned function must be invoked with a tokenizer as `this`. A match
 * against `upper` or its lower-case form advances to `NEXT_STATE`; any other
 * character falls back to `IN_TAG_NAME` and is processed again there.
 *
 * @param {string} upper Expected letter in upper case.
 * @param {number} NEXT_STATE State to enter on a match.
 * @returns {function(string): void} State handler.
 */
function consumeSpecialNameChar(upper, NEXT_STATE) {
    var lower = upper.toLowerCase();

    return function(c) {
        var matches = c === lower || c === upper;

        if (matches) {
            this._state = NEXT_STATE;
            return;
        }

        //not the special name after all: treat it as an ordinary tag name
        this._state = IN_TAG_NAME;
        this._index--; //consume the token again
    };
}
| |
/**
 * Creates a tokenizer in its initial state.
 *
 * @param {Object} [options] Optional settings; only `xmlMode` and
 *     `decodeEntities` are read, both coerced to booleans.
 * @param {Object} cbs Object whose callbacks receive the emitted tokens.
 */
function Tokenizer(options, cbs) {
    //parser position
    this._state = TEXT;
    this._buffer = "";
    this._sectionStart = 0;
    this._index = 0;
    this._bufferOffset = 0; //chars removed from _buffer

    //context to return to / special tag currently open
    this._baseState = TEXT;
    this._special = SPECIAL_NONE;

    this._cbs = cbs;

    //lifecycle flags
    this._running = true;
    this._ended = false;

    //options
    this._xmlMode = Boolean(options && options.xmlMode);
    this._decodeEntities = Boolean(options && options.decodeEntities);
}
| |
//TEXT state: plain text until a tag ("<") or, when entity decoding is on and
//no special tag is open, an entity ("&") begins.
Tokenizer.prototype._stateText = function(c) {
    var startsTag = c === "<";
    var startsEntity =
        !startsTag &&
        c === "&" &&
        this._decodeEntities &&
        this._special === SPECIAL_NONE;

    if (!startsTag && !startsEntity) return;

    //flush the text collected so far
    if (this._index > this._sectionStart) {
        this._cbs.ontext(this._getSection());
    }

    if (startsTag) {
        this._state = BEFORE_TAG_NAME;
    } else {
        this._baseState = TEXT;
        this._state = BEFORE_ENTITY;
    }

    this._sectionStart = this._index;
};
| |
//BEFORE_TAG_NAME state: decides what kind of construct the "<" opened.
Tokenizer.prototype._stateBeforeTagName = function(c) {
    if (c === "/") {
        this._state = BEFORE_CLOSING_TAG_NAME;
        return;
    }

    if (c === "<") {
        //the previous "<" was text; this one may open a tag
        this._cbs.ontext(this._getSection());
        this._sectionStart = this._index;
        return;
    }

    if (c === ">" || this._special !== SPECIAL_NONE || whitespace(c)) {
        //not a tag: keep collecting text
        this._state = TEXT;
        return;
    }

    if (c === "!" || c === "?") {
        this._state = c === "!" ? BEFORE_DECLARATION : IN_PROCESSING_INSTRUCTION;
        this._sectionStart = this._index + 1;
        return;
    }

    //outside xml mode a leading "s" may start a special tag name
    var mayBeSpecial = !this._xmlMode && (c === "s" || c === "S");

    this._state = mayBeSpecial ? BEFORE_SPECIAL : IN_TAG_NAME;
    this._sectionStart = this._index;
};
| |
//IN_TAG_NAME state: the name ends at "/", ">" or whitespace.
Tokenizer.prototype._stateInTagName = function(c) {
    var endsName = c === "/" || c === ">" || whitespace(c);

    if (!endsName) return;

    this._emitToken("onopentagname");
    this._state = BEFORE_ATTRIBUTE_NAME;
    this._index--; //let the next state see the terminating character
};
| |
//BEFORE_CLOSING_TAG_NAME state (after "</"): whitespace is skipped.
Tokenizer.prototype._stateBeforeCloseingTagName = function(c) {
    if (whitespace(c)) return;

    if (c === ">") {
        this._state = TEXT;
        return;
    }

    if (this._special === SPECIAL_NONE) {
        this._state = IN_CLOSING_TAG_NAME;
        this._sectionStart = this._index;
        return;
    }

    //inside a special tag only a name starting with "s" is followed further
    if (c === "s" || c === "S") {
        this._state = BEFORE_SPECIAL_END;
        return;
    }

    this._state = TEXT;
    this._index--;
};
| |
//IN_CLOSING_TAG_NAME state: the name ends at ">" or whitespace.
Tokenizer.prototype._stateInCloseingTagName = function(c) {
    var endsName = c === ">" || whitespace(c);

    if (!endsName) return;

    this._emitToken("onclosetag");
    this._state = AFTER_CLOSING_TAG_NAME;
    this._index--; //let the next state see the terminating character
};
| |
//AFTER_CLOSING_TAG_NAME state: everything up to the next ">" is skipped.
Tokenizer.prototype._stateAfterCloseingTagName = function(c) {
    if (c !== ">") return;

    this._state = TEXT;
    this._sectionStart = this._index + 1;
};
| |
//BEFORE_ATTRIBUTE_NAME state: inside an open tag, between attributes.
Tokenizer.prototype._stateBeforeAttributeName = function(c) {
    if (c === ">") {
        this._cbs.onopentagend();
        this._state = TEXT;
        this._sectionStart = this._index + 1;
        return;
    }

    if (c === "/") {
        this._state = IN_SELF_CLOSING_TAG;
        return;
    }

    if (whitespace(c)) return;

    this._state = IN_ATTRIBUTE_NAME;
    this._sectionStart = this._index;
};
| |
//IN_SELF_CLOSING_TAG state (after "/" inside an open tag).
Tokenizer.prototype._stateInSelfClosingTag = function(c) {
    if (c === ">") {
        this._cbs.onselfclosingtag();
        this._state = TEXT;
        this._sectionStart = this._index + 1;
        return;
    }

    if (whitespace(c)) return;

    //the "/" did not close the tag: continue with the attributes
    this._state = BEFORE_ATTRIBUTE_NAME;
    this._index--;
};
| |
//IN_ATTRIBUTE_NAME state: the name ends at "=", "/", ">" or whitespace.
Tokenizer.prototype._stateInAttributeName = function(c) {
    var endsName = c === "=" || c === "/" || c === ">" || whitespace(c);

    if (!endsName) return;

    //reports the name and marks the section as closed (_sectionStart = -1)
    this._emitToken("onattribname");
    this._state = AFTER_ATTRIBUTE_NAME;
    this._index--;
};
| |
//AFTER_ATTRIBUTE_NAME state: either a value follows ("=") or the attribute
//ends without one.
Tokenizer.prototype._stateAfterAttributeName = function(c) {
    if (c === "=") {
        this._state = BEFORE_ATTRIBUTE_VALUE;
        return;
    }

    if (c === "/" || c === ">") {
        this._cbs.onattribend();
        this._state = BEFORE_ATTRIBUTE_NAME;
        this._index--;
        return;
    }

    if (whitespace(c)) return;

    //a new attribute name starts right here
    this._cbs.onattribend();
    this._state = IN_ATTRIBUTE_NAME;
    this._sectionStart = this._index;
};
| |
//BEFORE_ATTRIBUTE_VALUE state (after "="): picks the quoting style.
Tokenizer.prototype._stateBeforeAttributeValue = function(c) {
    if (c === '"' || c === "'") {
        this._state = c === '"' ? IN_ATTRIBUTE_VALUE_DQ : IN_ATTRIBUTE_VALUE_SQ;
        this._sectionStart = this._index + 1; //value starts after the quote
        return;
    }

    if (whitespace(c)) return;

    this._state = IN_ATTRIBUTE_VALUE_NQ;
    this._sectionStart = this._index;
    this._index--; //reconsume token
};
| |
//IN_ATTRIBUTE_VALUE_DQ state: value ends at the closing double quote.
Tokenizer.prototype._stateInAttributeValueDoubleQuotes = function(c) {
    if (c === '"') {
        this._emitToken("onattribdata");
        this._cbs.onattribend();
        this._state = BEFORE_ATTRIBUTE_NAME;
        return;
    }

    if (c === "&" && this._decodeEntities) {
        //flush the data so far, then come back here after the entity
        this._emitToken("onattribdata");
        this._baseState = this._state;
        this._state = BEFORE_ENTITY;
        this._sectionStart = this._index;
    }
};
| |
//IN_ATTRIBUTE_VALUE_SQ state: value ends at the closing single quote.
Tokenizer.prototype._stateInAttributeValueSingleQuotes = function(c) {
    if (c === "'") {
        this._emitToken("onattribdata");
        this._cbs.onattribend();
        this._state = BEFORE_ATTRIBUTE_NAME;
        return;
    }

    if (c === "&" && this._decodeEntities) {
        //flush the data so far, then come back here after the entity
        this._emitToken("onattribdata");
        this._baseState = this._state;
        this._state = BEFORE_ENTITY;
        this._sectionStart = this._index;
    }
};
| |
//IN_ATTRIBUTE_VALUE_NQ state: value ends at whitespace or ">".
Tokenizer.prototype._stateInAttributeValueNoQuotes = function(c) {
    if (whitespace(c) || c === ">") {
        this._emitToken("onattribdata");
        this._cbs.onattribend();
        this._state = BEFORE_ATTRIBUTE_NAME;
        this._index--; //the terminator belongs to the tag, not the value
        return;
    }

    if (c === "&" && this._decodeEntities) {
        //flush the data so far, then come back here after the entity
        this._emitToken("onattribdata");
        this._baseState = this._state;
        this._state = BEFORE_ENTITY;
        this._sectionStart = this._index;
    }
};
| |
//BEFORE_DECLARATION state (after "<!"): "[" may start CDATA, "-" may start a
//comment, anything else is a declaration.
Tokenizer.prototype._stateBeforeDeclaration = function(c) {
    if (c === "[") {
        this._state = BEFORE_CDATA_1;
    } else if (c === "-") {
        this._state = BEFORE_COMMENT;
    } else {
        this._state = IN_DECLARATION;
    }
};
| |
//IN_DECLARATION state: the declaration runs until the next ">".
Tokenizer.prototype._stateInDeclaration = function(c) {
    if (c !== ">") return;

    this._cbs.ondeclaration(this._getSection());
    this._state = TEXT;
    this._sectionStart = this._index + 1;
};
| |
//IN_PROCESSING_INSTRUCTION state: the instruction runs until the next ">".
Tokenizer.prototype._stateInProcessingInstruction = function(c) {
    if (c !== ">") return;

    this._cbs.onprocessinginstruction(this._getSection());
    this._state = TEXT;
    this._sectionStart = this._index + 1;
};
| |
//BEFORE_COMMENT state (after "<!-"): a second "-" opens a comment, anything
//else makes this a declaration.
Tokenizer.prototype._stateBeforeComment = function(c) {
    if (c !== "-") {
        this._state = IN_DECLARATION;
        return;
    }

    this._state = IN_COMMENT;
    this._sectionStart = this._index + 1; //comment body starts after "<!--"
};
| |
//IN_COMMENT state: a "-" may be the start of the closing "-->".
Tokenizer.prototype._stateInComment = function(c) {
    if (c !== "-") return;

    this._state = AFTER_COMMENT_1;
};

//AFTER_COMMENT_1 state (one "-" seen): a second "-" keeps the terminator
//going, anything else returns to the comment body.
Tokenizer.prototype._stateAfterComment1 = function(c) {
    this._state = c === "-" ? AFTER_COMMENT_2 : IN_COMMENT;
};
| |
//AFTER_COMMENT_2 state ("--" seen): ">" ends the comment.
Tokenizer.prototype._stateAfterComment2 = function(c) {
    //another dash: stay in AFTER_COMMENT_2 (`--->`)
    if (c === "-") return;

    if (c !== ">") {
        this._state = IN_COMMENT;
        return;
    }

    //remove 2 trailing chars (the "--" in front of the ">")
    var body = this._buffer.substring(this._sectionStart, this._index - 2);

    this._cbs.oncomment(body);
    this._state = TEXT;
    this._sectionStart = this._index + 1;
};
| |
//BEFORE_CDATA_1..5 states: match the letters of "CDATA" one at a time (in
//either case); any mismatch falls back to IN_DECLARATION and reprocesses the
//character there.
Tokenizer.prototype._stateBeforeCdata1 = ifElseState("C", BEFORE_CDATA_2, IN_DECLARATION);
Tokenizer.prototype._stateBeforeCdata2 = ifElseState("D", BEFORE_CDATA_3, IN_DECLARATION);
Tokenizer.prototype._stateBeforeCdata3 = ifElseState("A", BEFORE_CDATA_4, IN_DECLARATION);
Tokenizer.prototype._stateBeforeCdata4 = ifElseState("T", BEFORE_CDATA_5, IN_DECLARATION);
Tokenizer.prototype._stateBeforeCdata5 = ifElseState("A", BEFORE_CDATA_6, IN_DECLARATION);

//BEFORE_CDATA_6 state: the second "[" opens the CDATA content.
Tokenizer.prototype._stateBeforeCdata6 = function(c) {
    if (c !== "[") {
        this._state = IN_DECLARATION;
        this._index--;
        return;
    }

    this._state = IN_CDATA;
    this._sectionStart = this._index + 1;
};
| |
//IN_CDATA state: a "]" may be the start of the closing "]]>".
Tokenizer.prototype._stateInCdata = function(c) {
    if (c !== "]") return;

    this._state = AFTER_CDATA_1;
};

//AFTER_CDATA_1 state (one "]" seen): a second "]" keeps the terminator going,
//anything else returns to the CDATA content.
Tokenizer.prototype._stateAfterCdata1 = function(c) {
    this._state = c === "]" ? AFTER_CDATA_2 : IN_CDATA;
};
| |
//AFTER_CDATA_2 state ("]]" seen): ">" ends the CDATA section.
Tokenizer.prototype._stateAfterCdata2 = function(c) {
    //another bracket: stay in AFTER_CDATA_2 (`]]]>`)
    if (c === "]") return;

    if (c !== ">") {
        this._state = IN_CDATA;
        return;
    }

    //remove 2 trailing chars (the "]]" in front of the ">")
    var content = this._buffer.substring(this._sectionStart, this._index - 2);

    this._cbs.oncdata(content);
    this._state = TEXT;
    this._sectionStart = this._index + 1;
};
| |
//BEFORE_SPECIAL state (tag name starts with "s"): "c" continues towards
//"script", "t" towards "style", anything else is an ordinary tag name.
Tokenizer.prototype._stateBeforeSpecial = function(c) {
    switch (c) {
        case "c":
        case "C":
            this._state = BEFORE_SCRIPT_1;
            break;
        case "t":
        case "T":
            this._state = BEFORE_STYLE_1;
            break;
        default:
            this._state = IN_TAG_NAME;
            this._index--; //consume the token again
    }
};
| |
//BEFORE_SPECIAL_END state (inside a special tag, after "</s"): follows the
//closing tag name only if it can still match the tag that is open.
//
//On a mismatch the character is handed back to the TEXT state instead of
//being dropped, the same way the other fall-back transitions in this file
//work. Otherwise a "<" at this position (e.g. `</s</script>`) would be
//swallowed and the real closing tag that follows would be missed.
Tokenizer.prototype._stateBeforeSpecialEnd = function(c) {
    if (this._special === SPECIAL_SCRIPT && (c === "c" || c === "C")) {
        this._state = AFTER_SCRIPT_1;
    } else if (this._special === SPECIAL_STYLE && (c === "t" || c === "T")) {
        this._state = AFTER_STYLE_1;
    } else {
        this._state = TEXT;
        this._index--; //reconsume the token as text
    }
};
| |
//BEFORE_SCRIPT_1..4 states: match the remaining letters of "script" after
//"sc"; a mismatch continues as an ordinary tag name.
Tokenizer.prototype._stateBeforeScript1 = consumeSpecialNameChar("R", BEFORE_SCRIPT_2);
Tokenizer.prototype._stateBeforeScript2 = consumeSpecialNameChar("I", BEFORE_SCRIPT_3);
Tokenizer.prototype._stateBeforeScript3 = consumeSpecialNameChar("P", BEFORE_SCRIPT_4);
Tokenizer.prototype._stateBeforeScript4 = consumeSpecialNameChar("T", BEFORE_SCRIPT_5);

//BEFORE_SCRIPT_5 state ("script" fully matched): the tag is only treated as
//special when the name ends here.
Tokenizer.prototype._stateBeforeScript5 = function(c) {
    var nameEnds = c === "/" || c === ">" || whitespace(c);

    if (nameEnds) {
        this._special = SPECIAL_SCRIPT;
    }

    //either way the character is handled by the regular tag name state
    this._state = IN_TAG_NAME;
    this._index--; //consume the token again
};
| |
//AFTER_SCRIPT_1..4 states: match the remaining letters of a closing
//"script" tag name; a mismatch goes back to TEXT and reprocesses the
//character there.
Tokenizer.prototype._stateAfterScript1 = ifElseState("R", AFTER_SCRIPT_2, TEXT);
Tokenizer.prototype._stateAfterScript2 = ifElseState("I", AFTER_SCRIPT_3, TEXT);
Tokenizer.prototype._stateAfterScript3 = ifElseState("P", AFTER_SCRIPT_4, TEXT);
Tokenizer.prototype._stateAfterScript4 = ifElseState("T", AFTER_SCRIPT_5, TEXT);

//AFTER_SCRIPT_5 state ("</script" fully matched): the special tag is closed
//only when the name ends here.
//
//On a mismatch the character is handed back to the TEXT state, matching the
//AFTER_SCRIPT_1..4 handlers above. Without that a "<" at this position (e.g.
//`</script</script>`) would be dropped and the real closing tag missed.
Tokenizer.prototype._stateAfterScript5 = function(c) {
    if (c === ">" || whitespace(c)) {
        this._special = SPECIAL_NONE;
        this._state = IN_CLOSING_TAG_NAME;
        this._sectionStart = this._index - 6; //back to the start of the 6 letter name
        this._index--; //reconsume the token
    } else {
        this._state = TEXT;
        this._index--; //reconsume the token as text
    }
};
| |
//BEFORE_STYLE_1..3 states: match the remaining letters of "style" after
//"st"; a mismatch continues as an ordinary tag name.
Tokenizer.prototype._stateBeforeStyle1 = consumeSpecialNameChar("Y", BEFORE_STYLE_2);
Tokenizer.prototype._stateBeforeStyle2 = consumeSpecialNameChar("L", BEFORE_STYLE_3);
Tokenizer.prototype._stateBeforeStyle3 = consumeSpecialNameChar("E", BEFORE_STYLE_4);

//BEFORE_STYLE_4 state ("style" fully matched): the tag is only treated as
//special when the name ends here.
Tokenizer.prototype._stateBeforeStyle4 = function(c) {
    var nameEnds = c === "/" || c === ">" || whitespace(c);

    if (nameEnds) {
        this._special = SPECIAL_STYLE;
    }

    //either way the character is handled by the regular tag name state
    this._state = IN_TAG_NAME;
    this._index--; //consume the token again
};
| |
//AFTER_STYLE_1..3 states: match the remaining letters of a closing "style"
//tag name; a mismatch goes back to TEXT and reprocesses the character there.
Tokenizer.prototype._stateAfterStyle1 = ifElseState("Y", AFTER_STYLE_2, TEXT);
Tokenizer.prototype._stateAfterStyle2 = ifElseState("L", AFTER_STYLE_3, TEXT);
Tokenizer.prototype._stateAfterStyle3 = ifElseState("E", AFTER_STYLE_4, TEXT);

//AFTER_STYLE_4 state ("</style" fully matched): the special tag is closed
//only when the name ends here.
//
//On a mismatch the character is handed back to the TEXT state, matching the
//AFTER_STYLE_1..3 handlers above. Without that a "<" at this position (e.g.
//`</style</style>`) would be dropped and the real closing tag missed.
Tokenizer.prototype._stateAfterStyle4 = function(c) {
    if (c === ">" || whitespace(c)) {
        this._special = SPECIAL_NONE;
        this._state = IN_CLOSING_TAG_NAME;
        this._sectionStart = this._index - 5; //back to the start of the 5 letter name
        this._index--; //reconsume the token
    } else {
        this._state = TEXT;
        this._index--; //reconsume the token as text
    }
};
| |
//BEFORE_ENTITY state (after "&"): "#" announces a numeric entity, anything
//else is reprocessed as the first character of a named entity.
Tokenizer.prototype._stateBeforeEntity = ifElseState("#", BEFORE_NUMERIC_ENTITY, IN_NAMED_ENTITY);

//BEFORE_NUMERIC_ENTITY state (after "&#"): "x"/"X" announces a hexadecimal
//entity, anything else is reprocessed as part of a decimal one.
Tokenizer.prototype._stateBeforeNumericEntity = ifElseState("X", IN_HEX_ENTITY, IN_NUMERIC_ENTITY);
| |
//for entities terminated with a semicolon: looks up the text between the "&"
//(at _sectionStart) and the current index; on a hit the decoded value is
//emitted and the section restarts after the current character
Tokenizer.prototype._parseNamedEntityStrict = function() {
    var nameStart = this._sectionStart + 1; //skip the "&"

    if (nameStart >= this._index) return; //empty name

    var name = this._buffer.substring(nameStart, this._index);
    var table = this._xmlMode ? xmlMap : entityMap;

    if (!table.hasOwnProperty(name)) return;

    this._emitPartial(table[name]);
    this._sectionStart = this._index + 1;
};
| |
//parses legacy entities (without trailing semicolon): tries the longest
//candidate first and shortens it until a known name is found
Tokenizer.prototype._parseLegacyEntity = function() {
    var start = this._sectionStart + 1; //skip the "&"

    //candidates are between 2 and 6 characters long
    var length = Math.min(this._index - start, 6);

    for (; length >= 2; length--) {
        var candidate = this._buffer.substr(start, length);

        if (legacyMap.hasOwnProperty(candidate)) {
            this._emitPartial(legacyMap[candidate]);
            this._sectionStart += length + 1; //past "&" and the name
            return;
        }
    }
};
| |
//IN_NAMED_ENTITY state: collects ASCII letters and digits; ";" or any other
//character ends the entity name.
Tokenizer.prototype._stateInNamedEntity = function(c) {
    if (c === ";") {
        this._parseNamedEntityStrict();

        //nothing matched: outside xml mode, try the legacy names as well
        if (this._sectionStart + 1 < this._index && !this._xmlMode) {
            this._parseLegacyEntity();
        }

        this._state = this._baseState;
        return;
    }

    var isNameChar =
        (c >= "a" && c <= "z") ||
        (c >= "A" && c <= "Z") ||
        (c >= "0" && c <= "9");

    if (isNameChar) return;

    var hasName = this._sectionStart + 1 !== this._index;

    if (!this._xmlMode && hasName) {
        if (this._baseState === TEXT) {
            this._parseLegacyEntity();
        } else if (c !== "=") {
            //inside an attribute value; skipped when "=" follows the name
            this._parseNamedEntityStrict();
        }
    }

    this._state = this._baseState;
    this._index--; //the terminator is handled by the base state
};
| |
//Decodes the numeric entity that starts at _sectionStart and returns to the
//base state.
//  offset: number of prefix characters in front of the digits
//  base:   radix handed to parseInt
Tokenizer.prototype._decodeNumericEntity = function(offset, base) {
    var digitsStart = this._sectionStart + offset;

    if (digitsStart === this._index) {
        //no digits at all
        this._sectionStart--;
    } else {
        //parse entity
        var digits = this._buffer.substring(digitsStart, this._index);
        var codePoint = parseInt(digits, base);

        this._emitPartial(decodeCodePoint(codePoint));
        this._sectionStart = this._index;
    }

    this._state = this._baseState;
};
| |
//IN_NUMERIC_ENTITY state (after "&#"): collects decimal digits.
Tokenizer.prototype._stateInNumericEntity = function(c) {
    if (c === ";") {
        this._decodeNumericEntity(2, 10);
        this._sectionStart++;
        return;
    }

    if (c >= "0" && c <= "9") return;

    //unterminated entity: decoded outside xml mode, left alone in xml mode
    if (this._xmlMode) {
        this._state = this._baseState;
    } else {
        this._decodeNumericEntity(2, 10);
    }

    this._index--; //the terminator is handled by the base state
};
| |
//IN_HEX_ENTITY state (after "&#x"): collects hexadecimal digits.
Tokenizer.prototype._stateInHexEntity = function(c) {
    if (c === ";") {
        this._decodeNumericEntity(3, 16);
        this._sectionStart++;
        return;
    }

    var isHexDigit =
        (c >= "a" && c <= "f") ||
        (c >= "A" && c <= "F") ||
        (c >= "0" && c <= "9");

    if (isHexDigit) return;

    //unterminated entity: decoded outside xml mode, left alone in xml mode
    if (this._xmlMode) {
        this._state = this._baseState;
    } else {
        this._decodeNumericEntity(3, 16);
    }

    this._index--; //the terminator is handled by the base state
};
| |
//Drops the part of the buffer that is no longer needed and rebases _index,
//_sectionStart and _bufferOffset accordingly. While a section is open
//(_sectionStart >= 0) nothing happens unless the tokenizer is running.
Tokenizer.prototype._cleanup = function() {
    if (this._sectionStart >= 0) {
        if (!this._running) return;

        var hasPending = this._sectionStart !== this._index;

        if (hasPending && this._state !== TEXT) {
            //remove everything unnecessary, keep the unfinished section
            this._buffer = this._buffer.substr(this._sectionStart);
            this._index -= this._sectionStart;
            this._bufferOffset += this._sectionStart;
            this._sectionStart = 0;
            return;
        }

        //pending text can be emitted right away
        if (hasPending) {
            this._cbs.ontext(this._buffer.substr(this._sectionStart));
        }

        this._sectionStart = 0;
    }

    //nothing left to keep: drop the whole buffer
    this._buffer = "";
    this._bufferOffset += this._index;
    this._index = 0;
};
| |
//TODO make events conditional
//Appends `chunk` to the buffer and parses as much of it as possible.
//
//Once the tokenizer has ended, the call is reported through the `onerror`
//callback and the chunk is discarded; it is no longer appended and parsed
//after the error has been raised.
Tokenizer.prototype.write = function(chunk) {
    if (this._ended) {
        this._cbs.onerror(Error(".write() after done!"));
        return;
    }

    this._buffer += chunk;
    this._parse();
};
| |
//Main loop: hands one character at a time to the handler of the current
//state until the buffer is exhausted or the tokenizer is paused, then lets
//_cleanup trim the buffer. Handlers may decrement _index to have a character
//processed again by the state they switched to.
Tokenizer.prototype._parse = function() {
    while (this._index < this._buffer.length && this._running) {
        var c = this._buffer.charAt(this._index);

        switch (this._state) {
            /*
             * text and tags
             */
            case TEXT:
                this._stateText(c);
                break;
            case BEFORE_TAG_NAME:
                this._stateBeforeTagName(c);
                break;
            case IN_TAG_NAME:
                this._stateInTagName(c);
                break;
            case BEFORE_CLOSING_TAG_NAME:
                this._stateBeforeCloseingTagName(c);
                break;
            case IN_CLOSING_TAG_NAME:
                this._stateInCloseingTagName(c);
                break;
            case AFTER_CLOSING_TAG_NAME:
                this._stateAfterCloseingTagName(c);
                break;
            case IN_SELF_CLOSING_TAG:
                this._stateInSelfClosingTag(c);
                break;

            /*
             * attributes
             */
            case BEFORE_ATTRIBUTE_NAME:
                this._stateBeforeAttributeName(c);
                break;
            case IN_ATTRIBUTE_NAME:
                this._stateInAttributeName(c);
                break;
            case AFTER_ATTRIBUTE_NAME:
                this._stateAfterAttributeName(c);
                break;
            case BEFORE_ATTRIBUTE_VALUE:
                this._stateBeforeAttributeValue(c);
                break;
            case IN_ATTRIBUTE_VALUE_DQ:
                this._stateInAttributeValueDoubleQuotes(c);
                break;
            case IN_ATTRIBUTE_VALUE_SQ:
                this._stateInAttributeValueSingleQuotes(c);
                break;
            case IN_ATTRIBUTE_VALUE_NQ:
                this._stateInAttributeValueNoQuotes(c);
                break;

            /*
             * declarations
             */
            case BEFORE_DECLARATION:
                this._stateBeforeDeclaration(c);
                break;
            case IN_DECLARATION:
                this._stateInDeclaration(c);
                break;

            /*
             * processing instructions
             */
            case IN_PROCESSING_INSTRUCTION:
                this._stateInProcessingInstruction(c);
                break;

            /*
             * comments
             */
            case BEFORE_COMMENT:
                this._stateBeforeComment(c);
                break;
            case IN_COMMENT:
                this._stateInComment(c);
                break;
            case AFTER_COMMENT_1:
                this._stateAfterComment1(c);
                break;
            case AFTER_COMMENT_2:
                this._stateAfterComment2(c);
                break;

            /*
             * cdata
             */
            case BEFORE_CDATA_1:
                this._stateBeforeCdata1(c);
                break;
            case BEFORE_CDATA_2:
                this._stateBeforeCdata2(c);
                break;
            case BEFORE_CDATA_3:
                this._stateBeforeCdata3(c);
                break;
            case BEFORE_CDATA_4:
                this._stateBeforeCdata4(c);
                break;
            case BEFORE_CDATA_5:
                this._stateBeforeCdata5(c);
                break;
            case BEFORE_CDATA_6:
                this._stateBeforeCdata6(c);
                break;
            case IN_CDATA:
                this._stateInCdata(c);
                break;
            case AFTER_CDATA_1:
                this._stateAfterCdata1(c);
                break;
            case AFTER_CDATA_2:
                this._stateAfterCdata2(c);
                break;

            /*
             * special tags
             */
            case BEFORE_SPECIAL:
                this._stateBeforeSpecial(c);
                break;
            case BEFORE_SPECIAL_END:
                this._stateBeforeSpecialEnd(c);
                break;

            /*
             * script
             */
            case BEFORE_SCRIPT_1:
                this._stateBeforeScript1(c);
                break;
            case BEFORE_SCRIPT_2:
                this._stateBeforeScript2(c);
                break;
            case BEFORE_SCRIPT_3:
                this._stateBeforeScript3(c);
                break;
            case BEFORE_SCRIPT_4:
                this._stateBeforeScript4(c);
                break;
            case BEFORE_SCRIPT_5:
                this._stateBeforeScript5(c);
                break;
            case AFTER_SCRIPT_1:
                this._stateAfterScript1(c);
                break;
            case AFTER_SCRIPT_2:
                this._stateAfterScript2(c);
                break;
            case AFTER_SCRIPT_3:
                this._stateAfterScript3(c);
                break;
            case AFTER_SCRIPT_4:
                this._stateAfterScript4(c);
                break;
            case AFTER_SCRIPT_5:
                this._stateAfterScript5(c);
                break;

            /*
             * style
             */
            case BEFORE_STYLE_1:
                this._stateBeforeStyle1(c);
                break;
            case BEFORE_STYLE_2:
                this._stateBeforeStyle2(c);
                break;
            case BEFORE_STYLE_3:
                this._stateBeforeStyle3(c);
                break;
            case BEFORE_STYLE_4:
                this._stateBeforeStyle4(c);
                break;
            case AFTER_STYLE_1:
                this._stateAfterStyle1(c);
                break;
            case AFTER_STYLE_2:
                this._stateAfterStyle2(c);
                break;
            case AFTER_STYLE_3:
                this._stateAfterStyle3(c);
                break;
            case AFTER_STYLE_4:
                this._stateAfterStyle4(c);
                break;

            /*
             * entities
             */
            case BEFORE_ENTITY:
                this._stateBeforeEntity(c);
                break;
            case BEFORE_NUMERIC_ENTITY:
                this._stateBeforeNumericEntity(c);
                break;
            case IN_NAMED_ENTITY:
                this._stateInNamedEntity(c);
                break;
            case IN_NUMERIC_ENTITY:
                this._stateInNumericEntity(c);
                break;
            case IN_HEX_ENTITY:
                this._stateInHexEntity(c);
                break;

            default:
                this._cbs.onerror(Error("unknown _state"), this._state);
        }

        this._index++;
    }

    this._cleanup();
};
| |
//Stops the parse loop before the next character; buffered input is kept.
Tokenizer.prototype.pause = function() {
    this._running = false;
};

//Resumes a paused tokenizer: parses whatever is still buffered and, if the
//input has already been ended, finishes it.
Tokenizer.prototype.resume = function() {
    this._running = true;

    var hasBufferedInput = this._index < this._buffer.length;

    if (hasBufferedInput) {
        this._parse();
    }

    if (this._ended) {
        this._finish();
    }
};
| |
//Marks the input as complete. An optional final `chunk` is written first;
//if the tokenizer is running, the remaining data is flushed immediately
//(while paused this is left to resume()).
//
//Calling end() again once the tokenizer has ended is reported through the
//`onerror` callback and has no further effect, so the remaining data is not
//flushed and `onend` is not raised a second time.
Tokenizer.prototype.end = function(chunk) {
    if (this._ended) {
        this._cbs.onerror(Error(".end() after done!"));
        return;
    }

    if (chunk) this.write(chunk);

    this._ended = true;

    if (this._running) this._finish();
};
| |
//Flushes what is left of the current section and signals the end of input.
Tokenizer.prototype._finish = function() {
    var hasTrailingData = this._sectionStart < this._index;

    //if there is remaining data, emit it in a reasonable way
    if (hasTrailingData) {
        this._handleTrailingData();
    }

    this._cbs.onend();
};
| |
//Emits the unfinished section at the end of the input, depending on the
//state the tokenizer stopped in.
Tokenizer.prototype._handleTrailingData = function() {
    var data = this._buffer.substr(this._sectionStart);
    var state = this._state;

    if (
        state === IN_CDATA ||
        state === AFTER_CDATA_1 ||
        state === AFTER_CDATA_2
    ) {
        this._cbs.oncdata(data);
        return;
    }

    if (
        state === IN_COMMENT ||
        state === AFTER_COMMENT_1 ||
        state === AFTER_COMMENT_2
    ) {
        this._cbs.oncomment(data);
        return;
    }

    var inEntity =
        state === IN_NAMED_ENTITY ||
        state === IN_NUMERIC_ENTITY ||
        state === IN_HEX_ENTITY;

    if (inEntity && !this._xmlMode) {
        //decode the unterminated entity first ...
        if (state === IN_NAMED_ENTITY) {
            this._parseLegacyEntity();
        } else if (state === IN_NUMERIC_ENTITY) {
            this._decodeNumericEntity(2, 10);
        } else {
            this._decodeNumericEntity(3, 16);
        }

        //... then emit whatever is left the way the base state would
        if (this._sectionStart < this._index) {
            this._state = this._baseState;
            this._handleTrailingData();
        }
        return;
    }

    switch (state) {
        case IN_TAG_NAME:
        case BEFORE_ATTRIBUTE_NAME:
        case BEFORE_ATTRIBUTE_VALUE:
        case AFTER_ATTRIBUTE_NAME:
        case IN_ATTRIBUTE_NAME:
        case IN_ATTRIBUTE_VALUE_SQ:
        case IN_ATTRIBUTE_VALUE_DQ:
        case IN_ATTRIBUTE_VALUE_NQ:
        case IN_CLOSING_TAG_NAME:
            //ignore remaining data
            //TODO add a way to remove current tag
            return;
    }

    this._cbs.ontext(data);
};
| |
//Puts the tokenizer back into its initial state by re-running the
//constructor on this instance with the current options and callbacks.
Tokenizer.prototype.reset = function() {
    var currentOptions = {
        xmlMode: this._xmlMode,
        decodeEntities: this._decodeEntities
    };

    Tokenizer.call(this, currentOptions, this._cbs);
};
| |
//Position of the current character, counted in characters since the first
//write (includes the characters already removed from the buffer).
Tokenizer.prototype.getAbsoluteIndex = function() {
    var removed = this._bufferOffset;

    return removed + this._index;
};

//Text of the current section: from _sectionStart up to, but not including,
//the current index.
Tokenizer.prototype._getSection = function() {
    var from = this._sectionStart;
    var to = this._index;

    return this._buffer.substring(from, to);
};
| |
//Passes the current section to the callback called `name` and marks the
//section as closed (_sectionStart = -1).
Tokenizer.prototype._emitToken = function(name) {
    var section = this._getSection();

    this._cbs[name](section);
    this._sectionStart = -1;
};
| |
//Emits a decoded entity value: as text when the entity was found in text,
//otherwise as attribute data.
Tokenizer.prototype._emitPartial = function(value) {
    if (this._baseState === TEXT) {
        this._cbs.ontext(value);
        return;
    }

    this._cbs.onattribdata(value); //TODO implement the new event
};