index.js 105 KB


  1. "use strict";
  2. Object.defineProperty(exports, "__esModule", { value: true });
  3. exports.Tokenizer = exports.TokenizerMode = void 0;
  4. const preprocessor_js_1 = require("./preprocessor.js");
  5. const unicode_js_1 = require("../common/unicode.js");
  6. const token_js_1 = require("../common/token.js");
  7. const decode_js_1 = require("entities/lib/decode.js");
  8. const error_codes_js_1 = require("../common/error-codes.js");
  9. const html_js_1 = require("../common/html.js");
  10. //States
  // Two-way lookup of tokenizer states: `State.DATA === 0` and `State[0] === 'DATA'`.
  // A name's index in STATE_NAMES is its numeric value, so entries must never be reordered.
  var State;
  (function (State) {
      const STATE_NAMES = [
          'DATA',
          'RCDATA',
          'RAWTEXT',
          'SCRIPT_DATA',
          'PLAINTEXT',
          'TAG_OPEN',
          'END_TAG_OPEN',
          'TAG_NAME',
          'RCDATA_LESS_THAN_SIGN',
          'RCDATA_END_TAG_OPEN',
          'RCDATA_END_TAG_NAME',
          'RAWTEXT_LESS_THAN_SIGN',
          'RAWTEXT_END_TAG_OPEN',
          'RAWTEXT_END_TAG_NAME',
          'SCRIPT_DATA_LESS_THAN_SIGN',
          'SCRIPT_DATA_END_TAG_OPEN',
          'SCRIPT_DATA_END_TAG_NAME',
          'SCRIPT_DATA_ESCAPE_START',
          'SCRIPT_DATA_ESCAPE_START_DASH',
          'SCRIPT_DATA_ESCAPED',
          'SCRIPT_DATA_ESCAPED_DASH',
          'SCRIPT_DATA_ESCAPED_DASH_DASH',
          'SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN',
          'SCRIPT_DATA_ESCAPED_END_TAG_OPEN',
          'SCRIPT_DATA_ESCAPED_END_TAG_NAME',
          'SCRIPT_DATA_DOUBLE_ESCAPE_START',
          'SCRIPT_DATA_DOUBLE_ESCAPED',
          'SCRIPT_DATA_DOUBLE_ESCAPED_DASH',
          'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH',
          'SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN',
          'SCRIPT_DATA_DOUBLE_ESCAPE_END',
          'BEFORE_ATTRIBUTE_NAME',
          'ATTRIBUTE_NAME',
          'AFTER_ATTRIBUTE_NAME',
          'BEFORE_ATTRIBUTE_VALUE',
          'ATTRIBUTE_VALUE_DOUBLE_QUOTED',
          'ATTRIBUTE_VALUE_SINGLE_QUOTED',
          'ATTRIBUTE_VALUE_UNQUOTED',
          'AFTER_ATTRIBUTE_VALUE_QUOTED',
          'SELF_CLOSING_START_TAG',
          'BOGUS_COMMENT',
          'MARKUP_DECLARATION_OPEN',
          'COMMENT_START',
          'COMMENT_START_DASH',
          'COMMENT',
          'COMMENT_LESS_THAN_SIGN',
          'COMMENT_LESS_THAN_SIGN_BANG',
          'COMMENT_LESS_THAN_SIGN_BANG_DASH',
          'COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH',
          'COMMENT_END_DASH',
          'COMMENT_END',
          'COMMENT_END_BANG',
          'DOCTYPE',
          'BEFORE_DOCTYPE_NAME',
          'DOCTYPE_NAME',
          'AFTER_DOCTYPE_NAME',
          'AFTER_DOCTYPE_PUBLIC_KEYWORD',
          'BEFORE_DOCTYPE_PUBLIC_IDENTIFIER',
          'DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED',
          'DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED',
          'AFTER_DOCTYPE_PUBLIC_IDENTIFIER',
          'BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS',
          'AFTER_DOCTYPE_SYSTEM_KEYWORD',
          'BEFORE_DOCTYPE_SYSTEM_IDENTIFIER',
          'DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED',
          'DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED',
          'AFTER_DOCTYPE_SYSTEM_IDENTIFIER',
          'BOGUS_DOCTYPE',
          'CDATA_SECTION',
          'CDATA_SECTION_BRACKET',
          'CDATA_SECTION_END',
          'CHARACTER_REFERENCE',
          'AMBIGUOUS_AMPERSAND',
      ];
      STATE_NAMES.forEach((name, value) => {
          // Forward (name -> number) and reverse (number -> name) entries.
          State[name] = value;
          State[value] = name;
      });
  })(State || (State = {}));
  87. //Tokenizer initial states for different modes
  // Exported subset of `State`: each mode name maps to the numeric value of the state with the same name.
  {
      const { DATA, RCDATA, RAWTEXT, SCRIPT_DATA, PLAINTEXT, CDATA_SECTION } = State;
      exports.TokenizerMode = { DATA, RCDATA, RAWTEXT, SCRIPT_DATA, PLAINTEXT, CDATA_SECTION };
  }
  96. //Utils
  97. //OPTIMIZATION: these utility functions should not be moved out of this module. V8 Crankshaft will not inline
  98. //these functions if they are located in another module, because of the context switch.
  99. //Always perform an inlining check before modifying these functions ('node --trace-inlining').
  /**
   * Tells whether `cp` lies in the inclusive range DIGIT_0..DIGIT_9.
   *
   * @param cp Code point to test.
   * @returns `true` when `cp` is inside the range, otherwise `false`.
   */
  function isAsciiDigit(cp) {
      const { DIGIT_0, DIGIT_9 } = unicode_js_1.CODE_POINTS;
      return DIGIT_0 <= cp && cp <= DIGIT_9;
  }
  /**
   * Tells whether `cp` lies in the inclusive range LATIN_CAPITAL_A..LATIN_CAPITAL_Z.
   *
   * @param cp Code point to test.
   * @returns `true` when `cp` is inside the range, otherwise `false`.
   */
  function isAsciiUpper(cp) {
      const { LATIN_CAPITAL_A, LATIN_CAPITAL_Z } = unicode_js_1.CODE_POINTS;
      return LATIN_CAPITAL_A <= cp && cp <= LATIN_CAPITAL_Z;
  }
  /**
   * Tells whether `cp` lies in the inclusive range LATIN_SMALL_A..LATIN_SMALL_Z.
   *
   * @param cp Code point to test.
   * @returns `true` when `cp` is inside the range, otherwise `false`.
   */
  function isAsciiLower(cp) {
      const { LATIN_SMALL_A, LATIN_SMALL_Z } = unicode_js_1.CODE_POINTS;
      return LATIN_SMALL_A <= cp && cp <= LATIN_SMALL_Z;
  }
  /**
   * Tells whether `cp` is accepted by `isAsciiLower` or by `isAsciiUpper`.
   *
   * @param cp Code point to test.
   * @returns `true` when either check accepts `cp`, otherwise `false`.
   */
  function isAsciiLetter(cp) {
      // The lowercase check runs first; the uppercase check only runs when it fails.
      if (isAsciiLower(cp)) {
          return true;
      }
      return isAsciiUpper(cp);
  }
  /**
   * Tells whether `cp` is accepted by `isAsciiLetter` or by `isAsciiDigit`.
   *
   * @param cp Code point to test.
   * @returns `true` when either check accepts `cp`, otherwise `false`.
   */
  function isAsciiAlphaNumeric(cp) {
      // The letter check runs first; the digit check only runs when it fails.
      if (isAsciiLetter(cp)) {
          return true;
      }
      return isAsciiDigit(cp);
  }
  /**
   * Adds 0x20 (32) to `cp`, the distance between an uppercase ASCII letter and its lowercase form.
   *
   * The addition is unconditional: no check is made here that `cp` is an uppercase letter,
   * so callers are responsible for passing only such code points.
   *
   * @param cp Code point to shift.
   * @returns `cp + 32`.
   */
  function toAsciiLower(cp) {
      return cp + 0x20;
  }
  /**
   * Tells whether `cp` is one of SPACE, LINE_FEED, TABULATION or FORM_FEED.
   *
   * @param cp Code point to test.
   * @returns `true` for those four code points, otherwise `false`.
   */
  function isWhitespace(cp) {
      const CP = unicode_js_1.CODE_POINTS;
      switch (cp) {
          case CP.SPACE:
          case CP.LINE_FEED:
          case CP.TABULATION:
          case CP.FORM_FEED:
              return true;
          default:
              return false;
      }
  }
  /**
   * Tells whether `cp` is whitespace (per `isWhitespace`), SOLIDUS or GREATER_THAN_SIGN.
   *
   * @param cp Code point to test.
   * @returns `true` when `cp` is one of those code points, otherwise `false`.
   */
  function isScriptDataDoubleEscapeSequenceEnd(cp) {
      if (isWhitespace(cp)) {
          return true;
      }
      const CP = unicode_js_1.CODE_POINTS;
      return cp === CP.SOLIDUS || cp === CP.GREATER_THAN_SIGN;
  }
  /**
   * Picks the parse-error code, if any, for the value of a numeric character reference.
   *
   * Checks run in a fixed order and the first match wins: NULL, above 0x10FFFF, surrogate,
   * undefined code point (reported as a noncharacter), then control code point or CARRIAGE_RETURN.
   *
   * @param code Numeric value of the character reference.
   * @returns The matching `ERR` code, or `null` when no check matches.
   */
  function getErrorForNumericCharacterReference(code) {
      const { CODE_POINTS, isSurrogate, isUndefinedCodePoint, isControlCodePoint } = unicode_js_1;
      const { ERR } = error_codes_js_1;
      if (code === CODE_POINTS.NULL) {
          return ERR.nullCharacterReference;
      }
      if (code > 0x10ffff) {
          return ERR.characterReferenceOutsideUnicodeRange;
      }
      if (isSurrogate(code)) {
          return ERR.surrogateCharacterReference;
      }
      if (isUndefinedCodePoint(code)) {
          return ERR.noncharacterCharacterReference;
      }
      if (isControlCodePoint(code) || code === CODE_POINTS.CARRIAGE_RETURN) {
          return ERR.controlCharacterReference;
      }
      return null;
  }
  142. //Tokenizer
  143. class Tokenizer {
  144. constructor(options, handler) {
  145. this.options = options;
  146. this.handler = handler;
  147. this.paused = false;
  148. /** Ensures that the parsing loop isn't run multiple times at once. */
  149. this.inLoop = false;
  150. /**
  151. * Indicates that the current adjusted node exists, is not an element in the HTML namespace,
  152. * and that it is not an integration point for either MathML or HTML.
  153. *
  154. * @see {@link https://html.spec.whatwg.org/multipage/parsing.html#tree-construction}
  155. */
  156. this.inForeignNode = false;
  157. this.lastStartTagName = '';
  158. this.active = false;
  159. this.state = State.DATA;
  160. this.returnState = State.DATA;
  161. this.entityStartPos = 0;
  162. this.consumedAfterSnapshot = -1;
  163. this.currentCharacterToken = null;
  164. this.currentToken = null;
  165. this.currentAttr = { name: '', value: '' };
  166. this.preprocessor = new preprocessor_js_1.Preprocessor(handler);
  167. this.currentLocation = this.getCurrentLocation(-1);
  168. this.entityDecoder = new decode_js_1.EntityDecoder(decode_js_1.htmlDecodeTree, (cp, consumed) => {
  169. // Note: Set `pos` _before_ flushing, as flushing might drop
  170. // the current chunk and invalidate `entityStartPos`.
  171. this.preprocessor.pos = this.entityStartPos + consumed - 1;
  172. this._flushCodePointConsumedAsCharacterReference(cp);
  173. }, handler.onParseError
  174. ? {
  175. missingSemicolonAfterCharacterReference: () => {
  176. this._err(error_codes_js_1.ERR.missingSemicolonAfterCharacterReference, 1);
  177. },
  178. absenceOfDigitsInNumericCharacterReference: (consumed) => {
  179. this._err(error_codes_js_1.ERR.absenceOfDigitsInNumericCharacterReference, this.entityStartPos - this.preprocessor.pos + consumed);
  180. },
  181. validateNumericCharacterReference: (code) => {
  182. const error = getErrorForNumericCharacterReference(code);
  183. if (error)
  184. this._err(error, 1);
  185. },
  186. }
  187. : undefined);
  188. }
  189. //Errors
  190. _err(code, cpOffset = 0) {
  191. var _a, _b;
  192. (_b = (_a = this.handler).onParseError) === null || _b === void 0 ? void 0 : _b.call(_a, this.preprocessor.getError(code, cpOffset));
  193. }
  194. // NOTE: `offset` may never run across line boundaries.
  195. getCurrentLocation(offset) {
  196. if (!this.options.sourceCodeLocationInfo) {
  197. return null;
  198. }
  199. return {
  200. startLine: this.preprocessor.line,
  201. startCol: this.preprocessor.col - offset,
  202. startOffset: this.preprocessor.offset - offset,
  203. endLine: -1,
  204. endCol: -1,
  205. endOffset: -1,
  206. };
  207. }
  208. _runParsingLoop() {
  209. if (this.inLoop)
  210. return;
  211. this.inLoop = true;
  212. while (this.active && !this.paused) {
  213. this.consumedAfterSnapshot = 0;
  214. const cp = this._consume();
  215. if (!this._ensureHibernation()) {
  216. this._callState(cp);
  217. }
  218. }
  219. this.inLoop = false;
  220. }
  221. //API
  222. pause() {
  223. this.paused = true;
  224. }
  225. resume(writeCallback) {
  226. if (!this.paused) {
  227. throw new Error('Parser was already resumed');
  228. }
  229. this.paused = false;
  230. // Necessary for synchronous resume.
  231. if (this.inLoop)
  232. return;
  233. this._runParsingLoop();
  234. if (!this.paused) {
  235. writeCallback === null || writeCallback === void 0 ? void 0 : writeCallback();
  236. }
  237. }
  238. write(chunk, isLastChunk, writeCallback) {
  239. this.active = true;
  240. this.preprocessor.write(chunk, isLastChunk);
  241. this._runParsingLoop();
  242. if (!this.paused) {
  243. writeCallback === null || writeCallback === void 0 ? void 0 : writeCallback();
  244. }
  245. }
  246. insertHtmlAtCurrentPos(chunk) {
  247. this.active = true;
  248. this.preprocessor.insertHtmlAtCurrentPos(chunk);
  249. this._runParsingLoop();
  250. }
  251. //Hibernation
  252. _ensureHibernation() {
  253. if (this.preprocessor.endOfChunkHit) {
  254. this.preprocessor.retreat(this.consumedAfterSnapshot);
  255. this.consumedAfterSnapshot = 0;
  256. this.active = false;
  257. return true;
  258. }
  259. return false;
  260. }
  261. //Consumption
  262. _consume() {
  263. this.consumedAfterSnapshot++;
  264. return this.preprocessor.advance();
  265. }
  266. _advanceBy(count) {
  267. this.consumedAfterSnapshot += count;
  268. for (let i = 0; i < count; i++) {
  269. this.preprocessor.advance();
  270. }
  271. }
  272. _consumeSequenceIfMatch(pattern, caseSensitive) {
  273. if (this.preprocessor.startsWith(pattern, caseSensitive)) {
  274. // We will already have consumed one character before calling this method.
  275. this._advanceBy(pattern.length - 1);
  276. return true;
  277. }
  278. return false;
  279. }
  280. //Token creation
  281. _createStartTagToken() {
  282. this.currentToken = {
  283. type: token_js_1.TokenType.START_TAG,
  284. tagName: '',
  285. tagID: html_js_1.TAG_ID.UNKNOWN,
  286. selfClosing: false,
  287. ackSelfClosing: false,
  288. attrs: [],
  289. location: this.getCurrentLocation(1),
  290. };
  291. }
  292. _createEndTagToken() {
  293. this.currentToken = {
  294. type: token_js_1.TokenType.END_TAG,
  295. tagName: '',
  296. tagID: html_js_1.TAG_ID.UNKNOWN,
  297. selfClosing: false,
  298. ackSelfClosing: false,
  299. attrs: [],
  300. location: this.getCurrentLocation(2),
  301. };
  302. }
  303. _createCommentToken(offset) {
  304. this.currentToken = {
  305. type: token_js_1.TokenType.COMMENT,
  306. data: '',
  307. location: this.getCurrentLocation(offset),
  308. };
  309. }
  310. _createDoctypeToken(initialName) {
  311. this.currentToken = {
  312. type: token_js_1.TokenType.DOCTYPE,
  313. name: initialName,
  314. forceQuirks: false,
  315. publicId: null,
  316. systemId: null,
  317. location: this.currentLocation,
  318. };
  319. }
  320. _createCharacterToken(type, chars) {
  321. this.currentCharacterToken = {
  322. type,
  323. chars,
  324. location: this.currentLocation,
  325. };
  326. }
  327. //Tag attributes
  328. _createAttr(attrNameFirstCh) {
  329. this.currentAttr = {
  330. name: attrNameFirstCh,
  331. value: '',
  332. };
  333. this.currentLocation = this.getCurrentLocation(0);
  334. }
  335. _leaveAttrName() {
  336. var _a;
  337. var _b;
  338. const token = this.currentToken;
  339. if ((0, token_js_1.getTokenAttr)(token, this.currentAttr.name) === null) {
  340. token.attrs.push(this.currentAttr);
  341. if (token.location && this.currentLocation) {
  342. const attrLocations = ((_a = (_b = token.location).attrs) !== null && _a !== void 0 ? _a : (_b.attrs = Object.create(null)));
  343. attrLocations[this.currentAttr.name] = this.currentLocation;
  344. // Set end location
  345. this._leaveAttrValue();
  346. }
  347. }
  348. else {
  349. this._err(error_codes_js_1.ERR.duplicateAttribute);
  350. }
  351. }
  352. _leaveAttrValue() {
  353. if (this.currentLocation) {
  354. this.currentLocation.endLine = this.preprocessor.line;
  355. this.currentLocation.endCol = this.preprocessor.col;
  356. this.currentLocation.endOffset = this.preprocessor.offset;
  357. }
  358. }
  359. //Token emission
  360. prepareToken(ct) {
  361. this._emitCurrentCharacterToken(ct.location);
  362. this.currentToken = null;
  363. if (ct.location) {
  364. ct.location.endLine = this.preprocessor.line;
  365. ct.location.endCol = this.preprocessor.col + 1;
  366. ct.location.endOffset = this.preprocessor.offset + 1;
  367. }
  368. this.currentLocation = this.getCurrentLocation(-1);
  369. }
  370. emitCurrentTagToken() {
  371. const ct = this.currentToken;
  372. this.prepareToken(ct);
  373. ct.tagID = (0, html_js_1.getTagID)(ct.tagName);
  374. if (ct.type === token_js_1.TokenType.START_TAG) {
  375. this.lastStartTagName = ct.tagName;
  376. this.handler.onStartTag(ct);
  377. }
  378. else {
  379. if (ct.attrs.length > 0) {
  380. this._err(error_codes_js_1.ERR.endTagWithAttributes);
  381. }
  382. if (ct.selfClosing) {
  383. this._err(error_codes_js_1.ERR.endTagWithTrailingSolidus);
  384. }
  385. this.handler.onEndTag(ct);
  386. }
  387. this.preprocessor.dropParsedChunk();
  388. }
  389. emitCurrentComment(ct) {
  390. this.prepareToken(ct);
  391. this.handler.onComment(ct);
  392. this.preprocessor.dropParsedChunk();
  393. }
  394. emitCurrentDoctype(ct) {
  395. this.prepareToken(ct);
  396. this.handler.onDoctype(ct);
  397. this.preprocessor.dropParsedChunk();
  398. }
  399. _emitCurrentCharacterToken(nextLocation) {
  400. if (this.currentCharacterToken) {
  401. //NOTE: if we have a pending character token, make it's end location equal to the
  402. //current token's start location.
  403. if (nextLocation && this.currentCharacterToken.location) {
  404. this.currentCharacterToken.location.endLine = nextLocation.startLine;
  405. this.currentCharacterToken.location.endCol = nextLocation.startCol;
  406. this.currentCharacterToken.location.endOffset = nextLocation.startOffset;
  407. }
  408. switch (this.currentCharacterToken.type) {
  409. case token_js_1.TokenType.CHARACTER: {
  410. this.handler.onCharacter(this.currentCharacterToken);
  411. break;
  412. }
  413. case token_js_1.TokenType.NULL_CHARACTER: {
  414. this.handler.onNullCharacter(this.currentCharacterToken);
  415. break;
  416. }
  417. case token_js_1.TokenType.WHITESPACE_CHARACTER: {
  418. this.handler.onWhitespaceCharacter(this.currentCharacterToken);
  419. break;
  420. }
  421. }
  422. this.currentCharacterToken = null;
  423. }
  424. }
  425. _emitEOFToken() {
  426. const location = this.getCurrentLocation(0);
  427. if (location) {
  428. location.endLine = location.startLine;
  429. location.endCol = location.startCol;
  430. location.endOffset = location.startOffset;
  431. }
  432. this._emitCurrentCharacterToken(location);
  433. this.handler.onEof({ type: token_js_1.TokenType.EOF, location });
  434. this.active = false;
  435. }
  436. //Characters emission
  437. //OPTIMIZATION: The specification uses only one type of character token (one token per character).
  438. //This causes a huge memory overhead and a lot of unnecessary parser loops. parse5 uses 3 groups of characters.
  439. //If we have a sequence of characters that belong to the same group, the parser can process it
  440. //as a single solid character token.
  441. //So, there are 3 types of character tokens in parse5:
  442. //1)TokenType.NULL_CHARACTER - \u0000-character sequences (e.g. '\u0000\u0000\u0000')
  443. //2)TokenType.WHITESPACE_CHARACTER - any whitespace/new-line character sequences (e.g. '\n \r\t \f')
  444. //3)TokenType.CHARACTER - any character sequence that doesn't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^')
  445. _appendCharToCurrentCharacterToken(type, ch) {
  446. if (this.currentCharacterToken) {
  447. if (this.currentCharacterToken.type === type) {
  448. this.currentCharacterToken.chars += ch;
  449. return;
  450. }
  451. else {
  452. this.currentLocation = this.getCurrentLocation(0);
  453. this._emitCurrentCharacterToken(this.currentLocation);
  454. this.preprocessor.dropParsedChunk();
  455. }
  456. }
  457. this._createCharacterToken(type, ch);
  458. }
  459. _emitCodePoint(cp) {
  460. const type = isWhitespace(cp)
  461. ? token_js_1.TokenType.WHITESPACE_CHARACTER
  462. : cp === unicode_js_1.CODE_POINTS.NULL
  463. ? token_js_1.TokenType.NULL_CHARACTER
  464. : token_js_1.TokenType.CHARACTER;
  465. this._appendCharToCurrentCharacterToken(type, String.fromCodePoint(cp));
  466. }
  467. //NOTE: used when we emit characters explicitly.
  468. //This is always for non-whitespace and non-null characters, which allows us to avoid additional checks.
  469. _emitChars(ch) {
  470. this._appendCharToCurrentCharacterToken(token_js_1.TokenType.CHARACTER, ch);
  471. }
  472. // Character reference helpers
  473. _startCharacterReference() {
  474. this.returnState = this.state;
  475. this.state = State.CHARACTER_REFERENCE;
  476. this.entityStartPos = this.preprocessor.pos;
  477. this.entityDecoder.startEntity(this._isCharacterReferenceInAttribute() ? decode_js_1.DecodingMode.Attribute : decode_js_1.DecodingMode.Legacy);
  478. }
  479. _isCharacterReferenceInAttribute() {
  480. return (this.returnState === State.ATTRIBUTE_VALUE_DOUBLE_QUOTED ||
  481. this.returnState === State.ATTRIBUTE_VALUE_SINGLE_QUOTED ||
  482. this.returnState === State.ATTRIBUTE_VALUE_UNQUOTED);
  483. }
  484. _flushCodePointConsumedAsCharacterReference(cp) {
  485. if (this._isCharacterReferenceInAttribute()) {
  486. this.currentAttr.value += String.fromCodePoint(cp);
  487. }
  488. else {
  489. this._emitCodePoint(cp);
  490. }
  491. }
  492. // Calling states this way turns out to be much faster than any other approach.
  493. _callState(cp) {
  494. switch (this.state) {
  495. case State.DATA: {
  496. this._stateData(cp);
  497. break;
  498. }
  499. case State.RCDATA: {
  500. this._stateRcdata(cp);
  501. break;
  502. }
  503. case State.RAWTEXT: {
  504. this._stateRawtext(cp);
  505. break;
  506. }
  507. case State.SCRIPT_DATA: {
  508. this._stateScriptData(cp);
  509. break;
  510. }
  511. case State.PLAINTEXT: {
  512. this._statePlaintext(cp);
  513. break;
  514. }
  515. case State.TAG_OPEN: {
  516. this._stateTagOpen(cp);
  517. break;
  518. }
  519. case State.END_TAG_OPEN: {
  520. this._stateEndTagOpen(cp);
  521. break;
  522. }
  523. case State.TAG_NAME: {
  524. this._stateTagName(cp);
  525. break;
  526. }
  527. case State.RCDATA_LESS_THAN_SIGN: {
  528. this._stateRcdataLessThanSign(cp);
  529. break;
  530. }
  531. case State.RCDATA_END_TAG_OPEN: {
  532. this._stateRcdataEndTagOpen(cp);
  533. break;
  534. }
  535. case State.RCDATA_END_TAG_NAME: {
  536. this._stateRcdataEndTagName(cp);
  537. break;
  538. }
  539. case State.RAWTEXT_LESS_THAN_SIGN: {
  540. this._stateRawtextLessThanSign(cp);
  541. break;
  542. }
  543. case State.RAWTEXT_END_TAG_OPEN: {
  544. this._stateRawtextEndTagOpen(cp);
  545. break;
  546. }
  547. case State.RAWTEXT_END_TAG_NAME: {
  548. this._stateRawtextEndTagName(cp);
  549. break;
  550. }
  551. case State.SCRIPT_DATA_LESS_THAN_SIGN: {
  552. this._stateScriptDataLessThanSign(cp);
  553. break;
  554. }
  555. case State.SCRIPT_DATA_END_TAG_OPEN: {
  556. this._stateScriptDataEndTagOpen(cp);
  557. break;
  558. }
  559. case State.SCRIPT_DATA_END_TAG_NAME: {
  560. this._stateScriptDataEndTagName(cp);
  561. break;
  562. }
  563. case State.SCRIPT_DATA_ESCAPE_START: {
  564. this._stateScriptDataEscapeStart(cp);
  565. break;
  566. }
  567. case State.SCRIPT_DATA_ESCAPE_START_DASH: {
  568. this._stateScriptDataEscapeStartDash(cp);
  569. break;
  570. }
  571. case State.SCRIPT_DATA_ESCAPED: {
  572. this._stateScriptDataEscaped(cp);
  573. break;
  574. }
  575. case State.SCRIPT_DATA_ESCAPED_DASH: {
  576. this._stateScriptDataEscapedDash(cp);
  577. break;
  578. }
  579. case State.SCRIPT_DATA_ESCAPED_DASH_DASH: {
  580. this._stateScriptDataEscapedDashDash(cp);
  581. break;
  582. }
  583. case State.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: {
  584. this._stateScriptDataEscapedLessThanSign(cp);
  585. break;
  586. }
  587. case State.SCRIPT_DATA_ESCAPED_END_TAG_OPEN: {
  588. this._stateScriptDataEscapedEndTagOpen(cp);
  589. break;
  590. }
  591. case State.SCRIPT_DATA_ESCAPED_END_TAG_NAME: {
  592. this._stateScriptDataEscapedEndTagName(cp);
  593. break;
  594. }
  595. case State.SCRIPT_DATA_DOUBLE_ESCAPE_START: {
  596. this._stateScriptDataDoubleEscapeStart(cp);
  597. break;
  598. }
  599. case State.SCRIPT_DATA_DOUBLE_ESCAPED: {
  600. this._stateScriptDataDoubleEscaped(cp);
  601. break;
  602. }
  603. case State.SCRIPT_DATA_DOUBLE_ESCAPED_DASH: {
  604. this._stateScriptDataDoubleEscapedDash(cp);
  605. break;
  606. }
  607. case State.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: {
  608. this._stateScriptDataDoubleEscapedDashDash(cp);
  609. break;
  610. }
  611. case State.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: {
  612. this._stateScriptDataDoubleEscapedLessThanSign(cp);
  613. break;
  614. }
  615. case State.SCRIPT_DATA_DOUBLE_ESCAPE_END: {
  616. this._stateScriptDataDoubleEscapeEnd(cp);
  617. break;
  618. }
  619. case State.BEFORE_ATTRIBUTE_NAME: {
  620. this._stateBeforeAttributeName(cp);
  621. break;
  622. }
  623. case State.ATTRIBUTE_NAME: {
  624. this._stateAttributeName(cp);
  625. break;
  626. }
  627. case State.AFTER_ATTRIBUTE_NAME: {
  628. this._stateAfterAttributeName(cp);
  629. break;
  630. }
  631. case State.BEFORE_ATTRIBUTE_VALUE: {
  632. this._stateBeforeAttributeValue(cp);
  633. break;
  634. }
  635. case State.ATTRIBUTE_VALUE_DOUBLE_QUOTED: {
  636. this._stateAttributeValueDoubleQuoted(cp);
  637. break;
  638. }
  639. case State.ATTRIBUTE_VALUE_SINGLE_QUOTED: {
  640. this._stateAttributeValueSingleQuoted(cp);
  641. break;
  642. }
  643. case State.ATTRIBUTE_VALUE_UNQUOTED: {
  644. this._stateAttributeValueUnquoted(cp);
  645. break;
  646. }
  647. case State.AFTER_ATTRIBUTE_VALUE_QUOTED: {
  648. this._stateAfterAttributeValueQuoted(cp);
  649. break;
  650. }
  651. case State.SELF_CLOSING_START_TAG: {
  652. this._stateSelfClosingStartTag(cp);
  653. break;
  654. }
  655. case State.BOGUS_COMMENT: {
  656. this._stateBogusComment(cp);
  657. break;
  658. }
  659. case State.MARKUP_DECLARATION_OPEN: {
  660. this._stateMarkupDeclarationOpen(cp);
  661. break;
  662. }
  663. case State.COMMENT_START: {
  664. this._stateCommentStart(cp);
  665. break;
  666. }
  667. case State.COMMENT_START_DASH: {
  668. this._stateCommentStartDash(cp);
  669. break;
  670. }
  671. case State.COMMENT: {
  672. this._stateComment(cp);
  673. break;
  674. }
  675. case State.COMMENT_LESS_THAN_SIGN: {
  676. this._stateCommentLessThanSign(cp);
  677. break;
  678. }
  679. case State.COMMENT_LESS_THAN_SIGN_BANG: {
  680. this._stateCommentLessThanSignBang(cp);
  681. break;
  682. }
  683. case State.COMMENT_LESS_THAN_SIGN_BANG_DASH: {
  684. this._stateCommentLessThanSignBangDash(cp);
  685. break;
  686. }
  687. case State.COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH: {
  688. this._stateCommentLessThanSignBangDashDash(cp);
  689. break;
  690. }
  691. case State.COMMENT_END_DASH: {
  692. this._stateCommentEndDash(cp);
  693. break;
  694. }
  695. case State.COMMENT_END: {
  696. this._stateCommentEnd(cp);
  697. break;
  698. }
  699. case State.COMMENT_END_BANG: {
  700. this._stateCommentEndBang(cp);
  701. break;
  702. }
  703. case State.DOCTYPE: {
  704. this._stateDoctype(cp);
  705. break;
  706. }
  707. case State.BEFORE_DOCTYPE_NAME: {
  708. this._stateBeforeDoctypeName(cp);
  709. break;
  710. }
  711. case State.DOCTYPE_NAME: {
  712. this._stateDoctypeName(cp);
  713. break;
  714. }
  715. case State.AFTER_DOCTYPE_NAME: {
  716. this._stateAfterDoctypeName(cp);
  717. break;
  718. }
  719. case State.AFTER_DOCTYPE_PUBLIC_KEYWORD: {
  720. this._stateAfterDoctypePublicKeyword(cp);
  721. break;
  722. }
  723. case State.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: {
  724. this._stateBeforeDoctypePublicIdentifier(cp);
  725. break;
  726. }
  727. case State.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: {
  728. this._stateDoctypePublicIdentifierDoubleQuoted(cp);
  729. break;
  730. }
  731. case State.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: {
  732. this._stateDoctypePublicIdentifierSingleQuoted(cp);
  733. break;
  734. }
  735. case State.AFTER_DOCTYPE_PUBLIC_IDENTIFIER: {
  736. this._stateAfterDoctypePublicIdentifier(cp);
  737. break;
  738. }
  739. case State.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: {
  740. this._stateBetweenDoctypePublicAndSystemIdentifiers(cp);
  741. break;
  742. }
  743. case State.AFTER_DOCTYPE_SYSTEM_KEYWORD: {
  744. this._stateAfterDoctypeSystemKeyword(cp);
  745. break;
  746. }
  747. case State.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: {
  748. this._stateBeforeDoctypeSystemIdentifier(cp);
  749. break;
  750. }
  751. case State.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: {
  752. this._stateDoctypeSystemIdentifierDoubleQuoted(cp);
  753. break;
  754. }
  755. case State.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: {
  756. this._stateDoctypeSystemIdentifierSingleQuoted(cp);
  757. break;
  758. }
  759. case State.AFTER_DOCTYPE_SYSTEM_IDENTIFIER: {
  760. this._stateAfterDoctypeSystemIdentifier(cp);
  761. break;
  762. }
  763. case State.BOGUS_DOCTYPE: {
  764. this._stateBogusDoctype(cp);
  765. break;
  766. }
  767. case State.CDATA_SECTION: {
  768. this._stateCdataSection(cp);
  769. break;
  770. }
  771. case State.CDATA_SECTION_BRACKET: {
  772. this._stateCdataSectionBracket(cp);
  773. break;
  774. }
  775. case State.CDATA_SECTION_END: {
  776. this._stateCdataSectionEnd(cp);
  777. break;
  778. }
  779. case State.CHARACTER_REFERENCE: {
  780. this._stateCharacterReference();
  781. break;
  782. }
  783. case State.AMBIGUOUS_AMPERSAND: {
  784. this._stateAmbiguousAmpersand(cp);
  785. break;
  786. }
  787. default: {
  788. throw new Error('Unknown state');
  789. }
  790. }
  791. }
  792. // State machine
  793. // Data state
  794. //------------------------------------------------------------------
  795. _stateData(cp) {
  796. switch (cp) {
  797. case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN: {
  798. this.state = State.TAG_OPEN;
  799. break;
  800. }
  801. case unicode_js_1.CODE_POINTS.AMPERSAND: {
  802. this._startCharacterReference();
  803. break;
  804. }
  805. case unicode_js_1.CODE_POINTS.NULL: {
  806. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  807. this._emitCodePoint(cp);
  808. break;
  809. }
  810. case unicode_js_1.CODE_POINTS.EOF: {
  811. this._emitEOFToken();
  812. break;
  813. }
  814. default: {
  815. this._emitCodePoint(cp);
  816. }
  817. }
  818. }
  819. // RCDATA state
  820. //------------------------------------------------------------------
  821. _stateRcdata(cp) {
  822. switch (cp) {
  823. case unicode_js_1.CODE_POINTS.AMPERSAND: {
  824. this._startCharacterReference();
  825. break;
  826. }
  827. case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN: {
  828. this.state = State.RCDATA_LESS_THAN_SIGN;
  829. break;
  830. }
  831. case unicode_js_1.CODE_POINTS.NULL: {
  832. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  833. this._emitChars(unicode_js_1.REPLACEMENT_CHARACTER);
  834. break;
  835. }
  836. case unicode_js_1.CODE_POINTS.EOF: {
  837. this._emitEOFToken();
  838. break;
  839. }
  840. default: {
  841. this._emitCodePoint(cp);
  842. }
  843. }
  844. }
  845. // RAWTEXT state
  846. //------------------------------------------------------------------
  847. _stateRawtext(cp) {
  848. switch (cp) {
  849. case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN: {
  850. this.state = State.RAWTEXT_LESS_THAN_SIGN;
  851. break;
  852. }
  853. case unicode_js_1.CODE_POINTS.NULL: {
  854. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  855. this._emitChars(unicode_js_1.REPLACEMENT_CHARACTER);
  856. break;
  857. }
  858. case unicode_js_1.CODE_POINTS.EOF: {
  859. this._emitEOFToken();
  860. break;
  861. }
  862. default: {
  863. this._emitCodePoint(cp);
  864. }
  865. }
  866. }
  867. // Script data state
  868. //------------------------------------------------------------------
  869. _stateScriptData(cp) {
  870. switch (cp) {
  871. case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN: {
  872. this.state = State.SCRIPT_DATA_LESS_THAN_SIGN;
  873. break;
  874. }
  875. case unicode_js_1.CODE_POINTS.NULL: {
  876. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  877. this._emitChars(unicode_js_1.REPLACEMENT_CHARACTER);
  878. break;
  879. }
  880. case unicode_js_1.CODE_POINTS.EOF: {
  881. this._emitEOFToken();
  882. break;
  883. }
  884. default: {
  885. this._emitCodePoint(cp);
  886. }
  887. }
  888. }
  889. // PLAINTEXT state
  890. //------------------------------------------------------------------
  891. _statePlaintext(cp) {
  892. switch (cp) {
  893. case unicode_js_1.CODE_POINTS.NULL: {
  894. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  895. this._emitChars(unicode_js_1.REPLACEMENT_CHARACTER);
  896. break;
  897. }
  898. case unicode_js_1.CODE_POINTS.EOF: {
  899. this._emitEOFToken();
  900. break;
  901. }
  902. default: {
  903. this._emitCodePoint(cp);
  904. }
  905. }
  906. }
  907. // Tag open state
  908. //------------------------------------------------------------------
  909. _stateTagOpen(cp) {
  910. if (isAsciiLetter(cp)) {
  911. this._createStartTagToken();
  912. this.state = State.TAG_NAME;
  913. this._stateTagName(cp);
  914. }
  915. else
  916. switch (cp) {
  917. case unicode_js_1.CODE_POINTS.EXCLAMATION_MARK: {
  918. this.state = State.MARKUP_DECLARATION_OPEN;
  919. break;
  920. }
  921. case unicode_js_1.CODE_POINTS.SOLIDUS: {
  922. this.state = State.END_TAG_OPEN;
  923. break;
  924. }
  925. case unicode_js_1.CODE_POINTS.QUESTION_MARK: {
  926. this._err(error_codes_js_1.ERR.unexpectedQuestionMarkInsteadOfTagName);
  927. this._createCommentToken(1);
  928. this.state = State.BOGUS_COMMENT;
  929. this._stateBogusComment(cp);
  930. break;
  931. }
  932. case unicode_js_1.CODE_POINTS.EOF: {
  933. this._err(error_codes_js_1.ERR.eofBeforeTagName);
  934. this._emitChars('<');
  935. this._emitEOFToken();
  936. break;
  937. }
  938. default: {
  939. this._err(error_codes_js_1.ERR.invalidFirstCharacterOfTagName);
  940. this._emitChars('<');
  941. this.state = State.DATA;
  942. this._stateData(cp);
  943. }
  944. }
  945. }
  946. // End tag open state
  947. //------------------------------------------------------------------
  948. _stateEndTagOpen(cp) {
  949. if (isAsciiLetter(cp)) {
  950. this._createEndTagToken();
  951. this.state = State.TAG_NAME;
  952. this._stateTagName(cp);
  953. }
  954. else
  955. switch (cp) {
  956. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  957. this._err(error_codes_js_1.ERR.missingEndTagName);
  958. this.state = State.DATA;
  959. break;
  960. }
  961. case unicode_js_1.CODE_POINTS.EOF: {
  962. this._err(error_codes_js_1.ERR.eofBeforeTagName);
  963. this._emitChars('</');
  964. this._emitEOFToken();
  965. break;
  966. }
  967. default: {
  968. this._err(error_codes_js_1.ERR.invalidFirstCharacterOfTagName);
  969. this._createCommentToken(2);
  970. this.state = State.BOGUS_COMMENT;
  971. this._stateBogusComment(cp);
  972. }
  973. }
  974. }
  975. // Tag name state
  976. //------------------------------------------------------------------
  977. _stateTagName(cp) {
  978. const token = this.currentToken;
  979. switch (cp) {
  980. case unicode_js_1.CODE_POINTS.SPACE:
  981. case unicode_js_1.CODE_POINTS.LINE_FEED:
  982. case unicode_js_1.CODE_POINTS.TABULATION:
  983. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  984. this.state = State.BEFORE_ATTRIBUTE_NAME;
  985. break;
  986. }
  987. case unicode_js_1.CODE_POINTS.SOLIDUS: {
  988. this.state = State.SELF_CLOSING_START_TAG;
  989. break;
  990. }
  991. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  992. this.state = State.DATA;
  993. this.emitCurrentTagToken();
  994. break;
  995. }
  996. case unicode_js_1.CODE_POINTS.NULL: {
  997. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  998. token.tagName += unicode_js_1.REPLACEMENT_CHARACTER;
  999. break;
  1000. }
  1001. case unicode_js_1.CODE_POINTS.EOF: {
  1002. this._err(error_codes_js_1.ERR.eofInTag);
  1003. this._emitEOFToken();
  1004. break;
  1005. }
  1006. default: {
  1007. token.tagName += String.fromCodePoint(isAsciiUpper(cp) ? toAsciiLower(cp) : cp);
  1008. }
  1009. }
  1010. }
  1011. // RCDATA less-than sign state
  1012. //------------------------------------------------------------------
  1013. _stateRcdataLessThanSign(cp) {
  1014. if (cp === unicode_js_1.CODE_POINTS.SOLIDUS) {
  1015. this.state = State.RCDATA_END_TAG_OPEN;
  1016. }
  1017. else {
  1018. this._emitChars('<');
  1019. this.state = State.RCDATA;
  1020. this._stateRcdata(cp);
  1021. }
  1022. }
  1023. // RCDATA end tag open state
  1024. //------------------------------------------------------------------
  1025. _stateRcdataEndTagOpen(cp) {
  1026. if (isAsciiLetter(cp)) {
  1027. this.state = State.RCDATA_END_TAG_NAME;
  1028. this._stateRcdataEndTagName(cp);
  1029. }
  1030. else {
  1031. this._emitChars('</');
  1032. this.state = State.RCDATA;
  1033. this._stateRcdata(cp);
  1034. }
  1035. }
  1036. handleSpecialEndTag(_cp) {
  1037. if (!this.preprocessor.startsWith(this.lastStartTagName, false)) {
  1038. return !this._ensureHibernation();
  1039. }
  1040. this._createEndTagToken();
  1041. const token = this.currentToken;
  1042. token.tagName = this.lastStartTagName;
  1043. const cp = this.preprocessor.peek(this.lastStartTagName.length);
  1044. switch (cp) {
  1045. case unicode_js_1.CODE_POINTS.SPACE:
  1046. case unicode_js_1.CODE_POINTS.LINE_FEED:
  1047. case unicode_js_1.CODE_POINTS.TABULATION:
  1048. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  1049. this._advanceBy(this.lastStartTagName.length);
  1050. this.state = State.BEFORE_ATTRIBUTE_NAME;
  1051. return false;
  1052. }
  1053. case unicode_js_1.CODE_POINTS.SOLIDUS: {
  1054. this._advanceBy(this.lastStartTagName.length);
  1055. this.state = State.SELF_CLOSING_START_TAG;
  1056. return false;
  1057. }
  1058. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  1059. this._advanceBy(this.lastStartTagName.length);
  1060. this.emitCurrentTagToken();
  1061. this.state = State.DATA;
  1062. return false;
  1063. }
  1064. default: {
  1065. return !this._ensureHibernation();
  1066. }
  1067. }
  1068. }
  1069. // RCDATA end tag name state
  1070. //------------------------------------------------------------------
  1071. _stateRcdataEndTagName(cp) {
  1072. if (this.handleSpecialEndTag(cp)) {
  1073. this._emitChars('</');
  1074. this.state = State.RCDATA;
  1075. this._stateRcdata(cp);
  1076. }
  1077. }
  1078. // RAWTEXT less-than sign state
  1079. //------------------------------------------------------------------
  1080. _stateRawtextLessThanSign(cp) {
  1081. if (cp === unicode_js_1.CODE_POINTS.SOLIDUS) {
  1082. this.state = State.RAWTEXT_END_TAG_OPEN;
  1083. }
  1084. else {
  1085. this._emitChars('<');
  1086. this.state = State.RAWTEXT;
  1087. this._stateRawtext(cp);
  1088. }
  1089. }
  1090. // RAWTEXT end tag open state
  1091. //------------------------------------------------------------------
  1092. _stateRawtextEndTagOpen(cp) {
  1093. if (isAsciiLetter(cp)) {
  1094. this.state = State.RAWTEXT_END_TAG_NAME;
  1095. this._stateRawtextEndTagName(cp);
  1096. }
  1097. else {
  1098. this._emitChars('</');
  1099. this.state = State.RAWTEXT;
  1100. this._stateRawtext(cp);
  1101. }
  1102. }
  1103. // RAWTEXT end tag name state
  1104. //------------------------------------------------------------------
  1105. _stateRawtextEndTagName(cp) {
  1106. if (this.handleSpecialEndTag(cp)) {
  1107. this._emitChars('</');
  1108. this.state = State.RAWTEXT;
  1109. this._stateRawtext(cp);
  1110. }
  1111. }
  1112. // Script data less-than sign state
  1113. //------------------------------------------------------------------
  1114. _stateScriptDataLessThanSign(cp) {
  1115. switch (cp) {
  1116. case unicode_js_1.CODE_POINTS.SOLIDUS: {
  1117. this.state = State.SCRIPT_DATA_END_TAG_OPEN;
  1118. break;
  1119. }
  1120. case unicode_js_1.CODE_POINTS.EXCLAMATION_MARK: {
  1121. this.state = State.SCRIPT_DATA_ESCAPE_START;
  1122. this._emitChars('<!');
  1123. break;
  1124. }
  1125. default: {
  1126. this._emitChars('<');
  1127. this.state = State.SCRIPT_DATA;
  1128. this._stateScriptData(cp);
  1129. }
  1130. }
  1131. }
  1132. // Script data end tag open state
  1133. //------------------------------------------------------------------
  1134. _stateScriptDataEndTagOpen(cp) {
  1135. if (isAsciiLetter(cp)) {
  1136. this.state = State.SCRIPT_DATA_END_TAG_NAME;
  1137. this._stateScriptDataEndTagName(cp);
  1138. }
  1139. else {
  1140. this._emitChars('</');
  1141. this.state = State.SCRIPT_DATA;
  1142. this._stateScriptData(cp);
  1143. }
  1144. }
  1145. // Script data end tag name state
  1146. //------------------------------------------------------------------
  1147. _stateScriptDataEndTagName(cp) {
  1148. if (this.handleSpecialEndTag(cp)) {
  1149. this._emitChars('</');
  1150. this.state = State.SCRIPT_DATA;
  1151. this._stateScriptData(cp);
  1152. }
  1153. }
  1154. // Script data escape start state
  1155. //------------------------------------------------------------------
  1156. _stateScriptDataEscapeStart(cp) {
  1157. if (cp === unicode_js_1.CODE_POINTS.HYPHEN_MINUS) {
  1158. this.state = State.SCRIPT_DATA_ESCAPE_START_DASH;
  1159. this._emitChars('-');
  1160. }
  1161. else {
  1162. this.state = State.SCRIPT_DATA;
  1163. this._stateScriptData(cp);
  1164. }
  1165. }
  1166. // Script data escape start dash state
  1167. //------------------------------------------------------------------
  1168. _stateScriptDataEscapeStartDash(cp) {
  1169. if (cp === unicode_js_1.CODE_POINTS.HYPHEN_MINUS) {
  1170. this.state = State.SCRIPT_DATA_ESCAPED_DASH_DASH;
  1171. this._emitChars('-');
  1172. }
  1173. else {
  1174. this.state = State.SCRIPT_DATA;
  1175. this._stateScriptData(cp);
  1176. }
  1177. }
  1178. // Script data escaped state
  1179. //------------------------------------------------------------------
  1180. _stateScriptDataEscaped(cp) {
  1181. switch (cp) {
  1182. case unicode_js_1.CODE_POINTS.HYPHEN_MINUS: {
  1183. this.state = State.SCRIPT_DATA_ESCAPED_DASH;
  1184. this._emitChars('-');
  1185. break;
  1186. }
  1187. case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN: {
  1188. this.state = State.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
  1189. break;
  1190. }
  1191. case unicode_js_1.CODE_POINTS.NULL: {
  1192. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  1193. this._emitChars(unicode_js_1.REPLACEMENT_CHARACTER);
  1194. break;
  1195. }
  1196. case unicode_js_1.CODE_POINTS.EOF: {
  1197. this._err(error_codes_js_1.ERR.eofInScriptHtmlCommentLikeText);
  1198. this._emitEOFToken();
  1199. break;
  1200. }
  1201. default: {
  1202. this._emitCodePoint(cp);
  1203. }
  1204. }
  1205. }
  1206. // Script data escaped dash state
  1207. //------------------------------------------------------------------
  1208. _stateScriptDataEscapedDash(cp) {
  1209. switch (cp) {
  1210. case unicode_js_1.CODE_POINTS.HYPHEN_MINUS: {
  1211. this.state = State.SCRIPT_DATA_ESCAPED_DASH_DASH;
  1212. this._emitChars('-');
  1213. break;
  1214. }
  1215. case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN: {
  1216. this.state = State.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
  1217. break;
  1218. }
  1219. case unicode_js_1.CODE_POINTS.NULL: {
  1220. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  1221. this.state = State.SCRIPT_DATA_ESCAPED;
  1222. this._emitChars(unicode_js_1.REPLACEMENT_CHARACTER);
  1223. break;
  1224. }
  1225. case unicode_js_1.CODE_POINTS.EOF: {
  1226. this._err(error_codes_js_1.ERR.eofInScriptHtmlCommentLikeText);
  1227. this._emitEOFToken();
  1228. break;
  1229. }
  1230. default: {
  1231. this.state = State.SCRIPT_DATA_ESCAPED;
  1232. this._emitCodePoint(cp);
  1233. }
  1234. }
  1235. }
  1236. // Script data escaped dash dash state
  1237. //------------------------------------------------------------------
  1238. _stateScriptDataEscapedDashDash(cp) {
  1239. switch (cp) {
  1240. case unicode_js_1.CODE_POINTS.HYPHEN_MINUS: {
  1241. this._emitChars('-');
  1242. break;
  1243. }
  1244. case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN: {
  1245. this.state = State.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
  1246. break;
  1247. }
  1248. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  1249. this.state = State.SCRIPT_DATA;
  1250. this._emitChars('>');
  1251. break;
  1252. }
  1253. case unicode_js_1.CODE_POINTS.NULL: {
  1254. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  1255. this.state = State.SCRIPT_DATA_ESCAPED;
  1256. this._emitChars(unicode_js_1.REPLACEMENT_CHARACTER);
  1257. break;
  1258. }
  1259. case unicode_js_1.CODE_POINTS.EOF: {
  1260. this._err(error_codes_js_1.ERR.eofInScriptHtmlCommentLikeText);
  1261. this._emitEOFToken();
  1262. break;
  1263. }
  1264. default: {
  1265. this.state = State.SCRIPT_DATA_ESCAPED;
  1266. this._emitCodePoint(cp);
  1267. }
  1268. }
  1269. }
  1270. // Script data escaped less-than sign state
  1271. //------------------------------------------------------------------
  1272. _stateScriptDataEscapedLessThanSign(cp) {
  1273. if (cp === unicode_js_1.CODE_POINTS.SOLIDUS) {
  1274. this.state = State.SCRIPT_DATA_ESCAPED_END_TAG_OPEN;
  1275. }
  1276. else if (isAsciiLetter(cp)) {
  1277. this._emitChars('<');
  1278. this.state = State.SCRIPT_DATA_DOUBLE_ESCAPE_START;
  1279. this._stateScriptDataDoubleEscapeStart(cp);
  1280. }
  1281. else {
  1282. this._emitChars('<');
  1283. this.state = State.SCRIPT_DATA_ESCAPED;
  1284. this._stateScriptDataEscaped(cp);
  1285. }
  1286. }
  1287. // Script data escaped end tag open state
  1288. //------------------------------------------------------------------
  1289. _stateScriptDataEscapedEndTagOpen(cp) {
  1290. if (isAsciiLetter(cp)) {
  1291. this.state = State.SCRIPT_DATA_ESCAPED_END_TAG_NAME;
  1292. this._stateScriptDataEscapedEndTagName(cp);
  1293. }
  1294. else {
  1295. this._emitChars('</');
  1296. this.state = State.SCRIPT_DATA_ESCAPED;
  1297. this._stateScriptDataEscaped(cp);
  1298. }
  1299. }
  1300. // Script data escaped end tag name state
  1301. //------------------------------------------------------------------
  1302. _stateScriptDataEscapedEndTagName(cp) {
  1303. if (this.handleSpecialEndTag(cp)) {
  1304. this._emitChars('</');
  1305. this.state = State.SCRIPT_DATA_ESCAPED;
  1306. this._stateScriptDataEscaped(cp);
  1307. }
  1308. }
  1309. // Script data double escape start state
  1310. //------------------------------------------------------------------
  1311. _stateScriptDataDoubleEscapeStart(cp) {
  1312. if (this.preprocessor.startsWith(unicode_js_1.SEQUENCES.SCRIPT, false) &&
  1313. isScriptDataDoubleEscapeSequenceEnd(this.preprocessor.peek(unicode_js_1.SEQUENCES.SCRIPT.length))) {
  1314. this._emitCodePoint(cp);
  1315. for (let i = 0; i < unicode_js_1.SEQUENCES.SCRIPT.length; i++) {
  1316. this._emitCodePoint(this._consume());
  1317. }
  1318. this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED;
  1319. }
  1320. else if (!this._ensureHibernation()) {
  1321. this.state = State.SCRIPT_DATA_ESCAPED;
  1322. this._stateScriptDataEscaped(cp);
  1323. }
  1324. }
  1325. // Script data double escaped state
  1326. //------------------------------------------------------------------
  1327. _stateScriptDataDoubleEscaped(cp) {
  1328. switch (cp) {
  1329. case unicode_js_1.CODE_POINTS.HYPHEN_MINUS: {
  1330. this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED_DASH;
  1331. this._emitChars('-');
  1332. break;
  1333. }
  1334. case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN: {
  1335. this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
  1336. this._emitChars('<');
  1337. break;
  1338. }
  1339. case unicode_js_1.CODE_POINTS.NULL: {
  1340. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  1341. this._emitChars(unicode_js_1.REPLACEMENT_CHARACTER);
  1342. break;
  1343. }
  1344. case unicode_js_1.CODE_POINTS.EOF: {
  1345. this._err(error_codes_js_1.ERR.eofInScriptHtmlCommentLikeText);
  1346. this._emitEOFToken();
  1347. break;
  1348. }
  1349. default: {
  1350. this._emitCodePoint(cp);
  1351. }
  1352. }
  1353. }
  1354. // Script data double escaped dash state
  1355. //------------------------------------------------------------------
  1356. _stateScriptDataDoubleEscapedDash(cp) {
  1357. switch (cp) {
  1358. case unicode_js_1.CODE_POINTS.HYPHEN_MINUS: {
  1359. this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH;
  1360. this._emitChars('-');
  1361. break;
  1362. }
  1363. case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN: {
  1364. this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
  1365. this._emitChars('<');
  1366. break;
  1367. }
  1368. case unicode_js_1.CODE_POINTS.NULL: {
  1369. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  1370. this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED;
  1371. this._emitChars(unicode_js_1.REPLACEMENT_CHARACTER);
  1372. break;
  1373. }
  1374. case unicode_js_1.CODE_POINTS.EOF: {
  1375. this._err(error_codes_js_1.ERR.eofInScriptHtmlCommentLikeText);
  1376. this._emitEOFToken();
  1377. break;
  1378. }
  1379. default: {
  1380. this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED;
  1381. this._emitCodePoint(cp);
  1382. }
  1383. }
  1384. }
  1385. // Script data double escaped dash dash state
  1386. //------------------------------------------------------------------
  1387. _stateScriptDataDoubleEscapedDashDash(cp) {
  1388. switch (cp) {
  1389. case unicode_js_1.CODE_POINTS.HYPHEN_MINUS: {
  1390. this._emitChars('-');
  1391. break;
  1392. }
  1393. case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN: {
  1394. this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
  1395. this._emitChars('<');
  1396. break;
  1397. }
  1398. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  1399. this.state = State.SCRIPT_DATA;
  1400. this._emitChars('>');
  1401. break;
  1402. }
  1403. case unicode_js_1.CODE_POINTS.NULL: {
  1404. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  1405. this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED;
  1406. this._emitChars(unicode_js_1.REPLACEMENT_CHARACTER);
  1407. break;
  1408. }
  1409. case unicode_js_1.CODE_POINTS.EOF: {
  1410. this._err(error_codes_js_1.ERR.eofInScriptHtmlCommentLikeText);
  1411. this._emitEOFToken();
  1412. break;
  1413. }
  1414. default: {
  1415. this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED;
  1416. this._emitCodePoint(cp);
  1417. }
  1418. }
  1419. }
  1420. // Script data double escaped less-than sign state
  1421. //------------------------------------------------------------------
  1422. _stateScriptDataDoubleEscapedLessThanSign(cp) {
  1423. if (cp === unicode_js_1.CODE_POINTS.SOLIDUS) {
  1424. this.state = State.SCRIPT_DATA_DOUBLE_ESCAPE_END;
  1425. this._emitChars('/');
  1426. }
  1427. else {
  1428. this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED;
  1429. this._stateScriptDataDoubleEscaped(cp);
  1430. }
  1431. }
  1432. // Script data double escape end state
  1433. //------------------------------------------------------------------
  1434. _stateScriptDataDoubleEscapeEnd(cp) {
  1435. if (this.preprocessor.startsWith(unicode_js_1.SEQUENCES.SCRIPT, false) &&
  1436. isScriptDataDoubleEscapeSequenceEnd(this.preprocessor.peek(unicode_js_1.SEQUENCES.SCRIPT.length))) {
  1437. this._emitCodePoint(cp);
  1438. for (let i = 0; i < unicode_js_1.SEQUENCES.SCRIPT.length; i++) {
  1439. this._emitCodePoint(this._consume());
  1440. }
  1441. this.state = State.SCRIPT_DATA_ESCAPED;
  1442. }
  1443. else if (!this._ensureHibernation()) {
  1444. this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED;
  1445. this._stateScriptDataDoubleEscaped(cp);
  1446. }
  1447. }
  1448. // Before attribute name state
  1449. //------------------------------------------------------------------
  1450. _stateBeforeAttributeName(cp) {
  1451. switch (cp) {
  1452. case unicode_js_1.CODE_POINTS.SPACE:
  1453. case unicode_js_1.CODE_POINTS.LINE_FEED:
  1454. case unicode_js_1.CODE_POINTS.TABULATION:
  1455. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  1456. // Ignore whitespace
  1457. break;
  1458. }
  1459. case unicode_js_1.CODE_POINTS.SOLIDUS:
  1460. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN:
  1461. case unicode_js_1.CODE_POINTS.EOF: {
  1462. this.state = State.AFTER_ATTRIBUTE_NAME;
  1463. this._stateAfterAttributeName(cp);
  1464. break;
  1465. }
  1466. case unicode_js_1.CODE_POINTS.EQUALS_SIGN: {
  1467. this._err(error_codes_js_1.ERR.unexpectedEqualsSignBeforeAttributeName);
  1468. this._createAttr('=');
  1469. this.state = State.ATTRIBUTE_NAME;
  1470. break;
  1471. }
  1472. default: {
  1473. this._createAttr('');
  1474. this.state = State.ATTRIBUTE_NAME;
  1475. this._stateAttributeName(cp);
  1476. }
  1477. }
  1478. }
  1479. // Attribute name state
  1480. //------------------------------------------------------------------
  1481. _stateAttributeName(cp) {
  1482. switch (cp) {
  1483. case unicode_js_1.CODE_POINTS.SPACE:
  1484. case unicode_js_1.CODE_POINTS.LINE_FEED:
  1485. case unicode_js_1.CODE_POINTS.TABULATION:
  1486. case unicode_js_1.CODE_POINTS.FORM_FEED:
  1487. case unicode_js_1.CODE_POINTS.SOLIDUS:
  1488. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN:
  1489. case unicode_js_1.CODE_POINTS.EOF: {
  1490. this._leaveAttrName();
  1491. this.state = State.AFTER_ATTRIBUTE_NAME;
  1492. this._stateAfterAttributeName(cp);
  1493. break;
  1494. }
  1495. case unicode_js_1.CODE_POINTS.EQUALS_SIGN: {
  1496. this._leaveAttrName();
  1497. this.state = State.BEFORE_ATTRIBUTE_VALUE;
  1498. break;
  1499. }
  1500. case unicode_js_1.CODE_POINTS.QUOTATION_MARK:
  1501. case unicode_js_1.CODE_POINTS.APOSTROPHE:
  1502. case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN: {
  1503. this._err(error_codes_js_1.ERR.unexpectedCharacterInAttributeName);
  1504. this.currentAttr.name += String.fromCodePoint(cp);
  1505. break;
  1506. }
  1507. case unicode_js_1.CODE_POINTS.NULL: {
  1508. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  1509. this.currentAttr.name += unicode_js_1.REPLACEMENT_CHARACTER;
  1510. break;
  1511. }
  1512. default: {
  1513. this.currentAttr.name += String.fromCodePoint(isAsciiUpper(cp) ? toAsciiLower(cp) : cp);
  1514. }
  1515. }
  1516. }
  1517. // After attribute name state
  1518. //------------------------------------------------------------------
  1519. _stateAfterAttributeName(cp) {
  1520. switch (cp) {
  1521. case unicode_js_1.CODE_POINTS.SPACE:
  1522. case unicode_js_1.CODE_POINTS.LINE_FEED:
  1523. case unicode_js_1.CODE_POINTS.TABULATION:
  1524. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  1525. // Ignore whitespace
  1526. break;
  1527. }
  1528. case unicode_js_1.CODE_POINTS.SOLIDUS: {
  1529. this.state = State.SELF_CLOSING_START_TAG;
  1530. break;
  1531. }
  1532. case unicode_js_1.CODE_POINTS.EQUALS_SIGN: {
  1533. this.state = State.BEFORE_ATTRIBUTE_VALUE;
  1534. break;
  1535. }
  1536. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  1537. this.state = State.DATA;
  1538. this.emitCurrentTagToken();
  1539. break;
  1540. }
  1541. case unicode_js_1.CODE_POINTS.EOF: {
  1542. this._err(error_codes_js_1.ERR.eofInTag);
  1543. this._emitEOFToken();
  1544. break;
  1545. }
  1546. default: {
  1547. this._createAttr('');
  1548. this.state = State.ATTRIBUTE_NAME;
  1549. this._stateAttributeName(cp);
  1550. }
  1551. }
  1552. }
  1553. // Before attribute value state
  1554. //------------------------------------------------------------------
  1555. _stateBeforeAttributeValue(cp) {
  1556. switch (cp) {
  1557. case unicode_js_1.CODE_POINTS.SPACE:
  1558. case unicode_js_1.CODE_POINTS.LINE_FEED:
  1559. case unicode_js_1.CODE_POINTS.TABULATION:
  1560. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  1561. // Ignore whitespace
  1562. break;
  1563. }
  1564. case unicode_js_1.CODE_POINTS.QUOTATION_MARK: {
  1565. this.state = State.ATTRIBUTE_VALUE_DOUBLE_QUOTED;
  1566. break;
  1567. }
  1568. case unicode_js_1.CODE_POINTS.APOSTROPHE: {
  1569. this.state = State.ATTRIBUTE_VALUE_SINGLE_QUOTED;
  1570. break;
  1571. }
  1572. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  1573. this._err(error_codes_js_1.ERR.missingAttributeValue);
  1574. this.state = State.DATA;
  1575. this.emitCurrentTagToken();
  1576. break;
  1577. }
  1578. default: {
  1579. this.state = State.ATTRIBUTE_VALUE_UNQUOTED;
  1580. this._stateAttributeValueUnquoted(cp);
  1581. }
  1582. }
  1583. }
  1584. // Attribute value (double-quoted) state
  1585. //------------------------------------------------------------------
  1586. _stateAttributeValueDoubleQuoted(cp) {
  1587. switch (cp) {
  1588. case unicode_js_1.CODE_POINTS.QUOTATION_MARK: {
  1589. this.state = State.AFTER_ATTRIBUTE_VALUE_QUOTED;
  1590. break;
  1591. }
  1592. case unicode_js_1.CODE_POINTS.AMPERSAND: {
  1593. this._startCharacterReference();
  1594. break;
  1595. }
  1596. case unicode_js_1.CODE_POINTS.NULL: {
  1597. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  1598. this.currentAttr.value += unicode_js_1.REPLACEMENT_CHARACTER;
  1599. break;
  1600. }
  1601. case unicode_js_1.CODE_POINTS.EOF: {
  1602. this._err(error_codes_js_1.ERR.eofInTag);
  1603. this._emitEOFToken();
  1604. break;
  1605. }
  1606. default: {
  1607. this.currentAttr.value += String.fromCodePoint(cp);
  1608. }
  1609. }
  1610. }
  1611. // Attribute value (single-quoted) state
  1612. //------------------------------------------------------------------
  1613. _stateAttributeValueSingleQuoted(cp) {
  1614. switch (cp) {
  1615. case unicode_js_1.CODE_POINTS.APOSTROPHE: {
  1616. this.state = State.AFTER_ATTRIBUTE_VALUE_QUOTED;
  1617. break;
  1618. }
  1619. case unicode_js_1.CODE_POINTS.AMPERSAND: {
  1620. this._startCharacterReference();
  1621. break;
  1622. }
  1623. case unicode_js_1.CODE_POINTS.NULL: {
  1624. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  1625. this.currentAttr.value += unicode_js_1.REPLACEMENT_CHARACTER;
  1626. break;
  1627. }
  1628. case unicode_js_1.CODE_POINTS.EOF: {
  1629. this._err(error_codes_js_1.ERR.eofInTag);
  1630. this._emitEOFToken();
  1631. break;
  1632. }
  1633. default: {
  1634. this.currentAttr.value += String.fromCodePoint(cp);
  1635. }
  1636. }
  1637. }
  1638. // Attribute value (unquoted) state
  1639. //------------------------------------------------------------------
  1640. _stateAttributeValueUnquoted(cp) {
  1641. switch (cp) {
  1642. case unicode_js_1.CODE_POINTS.SPACE:
  1643. case unicode_js_1.CODE_POINTS.LINE_FEED:
  1644. case unicode_js_1.CODE_POINTS.TABULATION:
  1645. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  1646. this._leaveAttrValue();
  1647. this.state = State.BEFORE_ATTRIBUTE_NAME;
  1648. break;
  1649. }
  1650. case unicode_js_1.CODE_POINTS.AMPERSAND: {
  1651. this._startCharacterReference();
  1652. break;
  1653. }
  1654. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  1655. this._leaveAttrValue();
  1656. this.state = State.DATA;
  1657. this.emitCurrentTagToken();
  1658. break;
  1659. }
  1660. case unicode_js_1.CODE_POINTS.NULL: {
  1661. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  1662. this.currentAttr.value += unicode_js_1.REPLACEMENT_CHARACTER;
  1663. break;
  1664. }
  1665. case unicode_js_1.CODE_POINTS.QUOTATION_MARK:
  1666. case unicode_js_1.CODE_POINTS.APOSTROPHE:
  1667. case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN:
  1668. case unicode_js_1.CODE_POINTS.EQUALS_SIGN:
  1669. case unicode_js_1.CODE_POINTS.GRAVE_ACCENT: {
  1670. this._err(error_codes_js_1.ERR.unexpectedCharacterInUnquotedAttributeValue);
  1671. this.currentAttr.value += String.fromCodePoint(cp);
  1672. break;
  1673. }
  1674. case unicode_js_1.CODE_POINTS.EOF: {
  1675. this._err(error_codes_js_1.ERR.eofInTag);
  1676. this._emitEOFToken();
  1677. break;
  1678. }
  1679. default: {
  1680. this.currentAttr.value += String.fromCodePoint(cp);
  1681. }
  1682. }
  1683. }
  1684. // After attribute value (quoted) state
  1685. //------------------------------------------------------------------
  1686. _stateAfterAttributeValueQuoted(cp) {
  1687. switch (cp) {
  1688. case unicode_js_1.CODE_POINTS.SPACE:
  1689. case unicode_js_1.CODE_POINTS.LINE_FEED:
  1690. case unicode_js_1.CODE_POINTS.TABULATION:
  1691. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  1692. this._leaveAttrValue();
  1693. this.state = State.BEFORE_ATTRIBUTE_NAME;
  1694. break;
  1695. }
  1696. case unicode_js_1.CODE_POINTS.SOLIDUS: {
  1697. this._leaveAttrValue();
  1698. this.state = State.SELF_CLOSING_START_TAG;
  1699. break;
  1700. }
  1701. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  1702. this._leaveAttrValue();
  1703. this.state = State.DATA;
  1704. this.emitCurrentTagToken();
  1705. break;
  1706. }
  1707. case unicode_js_1.CODE_POINTS.EOF: {
  1708. this._err(error_codes_js_1.ERR.eofInTag);
  1709. this._emitEOFToken();
  1710. break;
  1711. }
  1712. default: {
  1713. this._err(error_codes_js_1.ERR.missingWhitespaceBetweenAttributes);
  1714. this.state = State.BEFORE_ATTRIBUTE_NAME;
  1715. this._stateBeforeAttributeName(cp);
  1716. }
  1717. }
  1718. }
  1719. // Self-closing start tag state
  1720. //------------------------------------------------------------------
  1721. _stateSelfClosingStartTag(cp) {
  1722. switch (cp) {
  1723. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  1724. const token = this.currentToken;
  1725. token.selfClosing = true;
  1726. this.state = State.DATA;
  1727. this.emitCurrentTagToken();
  1728. break;
  1729. }
  1730. case unicode_js_1.CODE_POINTS.EOF: {
  1731. this._err(error_codes_js_1.ERR.eofInTag);
  1732. this._emitEOFToken();
  1733. break;
  1734. }
  1735. default: {
  1736. this._err(error_codes_js_1.ERR.unexpectedSolidusInTag);
  1737. this.state = State.BEFORE_ATTRIBUTE_NAME;
  1738. this._stateBeforeAttributeName(cp);
  1739. }
  1740. }
  1741. }
  1742. // Bogus comment state
  1743. //------------------------------------------------------------------
  1744. _stateBogusComment(cp) {
  1745. const token = this.currentToken;
  1746. switch (cp) {
  1747. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  1748. this.state = State.DATA;
  1749. this.emitCurrentComment(token);
  1750. break;
  1751. }
  1752. case unicode_js_1.CODE_POINTS.EOF: {
  1753. this.emitCurrentComment(token);
  1754. this._emitEOFToken();
  1755. break;
  1756. }
  1757. case unicode_js_1.CODE_POINTS.NULL: {
  1758. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  1759. token.data += unicode_js_1.REPLACEMENT_CHARACTER;
  1760. break;
  1761. }
  1762. default: {
  1763. token.data += String.fromCodePoint(cp);
  1764. }
  1765. }
  1766. }
  1767. // Markup declaration open state
  1768. //------------------------------------------------------------------
  1769. _stateMarkupDeclarationOpen(cp) {
  1770. if (this._consumeSequenceIfMatch(unicode_js_1.SEQUENCES.DASH_DASH, true)) {
  1771. this._createCommentToken(unicode_js_1.SEQUENCES.DASH_DASH.length + 1);
  1772. this.state = State.COMMENT_START;
  1773. }
  1774. else if (this._consumeSequenceIfMatch(unicode_js_1.SEQUENCES.DOCTYPE, false)) {
  1775. // NOTE: Doctypes tokens are created without fixed offsets. We keep track of the moment a doctype *might* start here.
  1776. this.currentLocation = this.getCurrentLocation(unicode_js_1.SEQUENCES.DOCTYPE.length + 1);
  1777. this.state = State.DOCTYPE;
  1778. }
  1779. else if (this._consumeSequenceIfMatch(unicode_js_1.SEQUENCES.CDATA_START, true)) {
  1780. if (this.inForeignNode) {
  1781. this.state = State.CDATA_SECTION;
  1782. }
  1783. else {
  1784. this._err(error_codes_js_1.ERR.cdataInHtmlContent);
  1785. this._createCommentToken(unicode_js_1.SEQUENCES.CDATA_START.length + 1);
  1786. this.currentToken.data = '[CDATA[';
  1787. this.state = State.BOGUS_COMMENT;
  1788. }
  1789. }
  1790. //NOTE: Sequence lookups can be abrupted by hibernation. In that case, lookup
  1791. //results are no longer valid and we will need to start over.
  1792. else if (!this._ensureHibernation()) {
  1793. this._err(error_codes_js_1.ERR.incorrectlyOpenedComment);
  1794. this._createCommentToken(2);
  1795. this.state = State.BOGUS_COMMENT;
  1796. this._stateBogusComment(cp);
  1797. }
  1798. }
  1799. // Comment start state
  1800. //------------------------------------------------------------------
  1801. _stateCommentStart(cp) {
  1802. switch (cp) {
  1803. case unicode_js_1.CODE_POINTS.HYPHEN_MINUS: {
  1804. this.state = State.COMMENT_START_DASH;
  1805. break;
  1806. }
  1807. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  1808. this._err(error_codes_js_1.ERR.abruptClosingOfEmptyComment);
  1809. this.state = State.DATA;
  1810. const token = this.currentToken;
  1811. this.emitCurrentComment(token);
  1812. break;
  1813. }
  1814. default: {
  1815. this.state = State.COMMENT;
  1816. this._stateComment(cp);
  1817. }
  1818. }
  1819. }
  1820. // Comment start dash state
  1821. //------------------------------------------------------------------
  1822. _stateCommentStartDash(cp) {
  1823. const token = this.currentToken;
  1824. switch (cp) {
  1825. case unicode_js_1.CODE_POINTS.HYPHEN_MINUS: {
  1826. this.state = State.COMMENT_END;
  1827. break;
  1828. }
  1829. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  1830. this._err(error_codes_js_1.ERR.abruptClosingOfEmptyComment);
  1831. this.state = State.DATA;
  1832. this.emitCurrentComment(token);
  1833. break;
  1834. }
  1835. case unicode_js_1.CODE_POINTS.EOF: {
  1836. this._err(error_codes_js_1.ERR.eofInComment);
  1837. this.emitCurrentComment(token);
  1838. this._emitEOFToken();
  1839. break;
  1840. }
  1841. default: {
  1842. token.data += '-';
  1843. this.state = State.COMMENT;
  1844. this._stateComment(cp);
  1845. }
  1846. }
  1847. }
  1848. // Comment state
  1849. //------------------------------------------------------------------
  1850. _stateComment(cp) {
  1851. const token = this.currentToken;
  1852. switch (cp) {
  1853. case unicode_js_1.CODE_POINTS.HYPHEN_MINUS: {
  1854. this.state = State.COMMENT_END_DASH;
  1855. break;
  1856. }
  1857. case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN: {
  1858. token.data += '<';
  1859. this.state = State.COMMENT_LESS_THAN_SIGN;
  1860. break;
  1861. }
  1862. case unicode_js_1.CODE_POINTS.NULL: {
  1863. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  1864. token.data += unicode_js_1.REPLACEMENT_CHARACTER;
  1865. break;
  1866. }
  1867. case unicode_js_1.CODE_POINTS.EOF: {
  1868. this._err(error_codes_js_1.ERR.eofInComment);
  1869. this.emitCurrentComment(token);
  1870. this._emitEOFToken();
  1871. break;
  1872. }
  1873. default: {
  1874. token.data += String.fromCodePoint(cp);
  1875. }
  1876. }
  1877. }
  1878. // Comment less-than sign state
  1879. //------------------------------------------------------------------
  1880. _stateCommentLessThanSign(cp) {
  1881. const token = this.currentToken;
  1882. switch (cp) {
  1883. case unicode_js_1.CODE_POINTS.EXCLAMATION_MARK: {
  1884. token.data += '!';
  1885. this.state = State.COMMENT_LESS_THAN_SIGN_BANG;
  1886. break;
  1887. }
  1888. case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN: {
  1889. token.data += '<';
  1890. break;
  1891. }
  1892. default: {
  1893. this.state = State.COMMENT;
  1894. this._stateComment(cp);
  1895. }
  1896. }
  1897. }
  1898. // Comment less-than sign bang state
  1899. //------------------------------------------------------------------
  1900. _stateCommentLessThanSignBang(cp) {
  1901. if (cp === unicode_js_1.CODE_POINTS.HYPHEN_MINUS) {
  1902. this.state = State.COMMENT_LESS_THAN_SIGN_BANG_DASH;
  1903. }
  1904. else {
  1905. this.state = State.COMMENT;
  1906. this._stateComment(cp);
  1907. }
  1908. }
  1909. // Comment less-than sign bang dash state
  1910. //------------------------------------------------------------------
  1911. _stateCommentLessThanSignBangDash(cp) {
  1912. if (cp === unicode_js_1.CODE_POINTS.HYPHEN_MINUS) {
  1913. this.state = State.COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH;
  1914. }
  1915. else {
  1916. this.state = State.COMMENT_END_DASH;
  1917. this._stateCommentEndDash(cp);
  1918. }
  1919. }
  1920. // Comment less-than sign bang dash dash state
  1921. //------------------------------------------------------------------
  1922. _stateCommentLessThanSignBangDashDash(cp) {
  1923. if (cp !== unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN && cp !== unicode_js_1.CODE_POINTS.EOF) {
  1924. this._err(error_codes_js_1.ERR.nestedComment);
  1925. }
  1926. this.state = State.COMMENT_END;
  1927. this._stateCommentEnd(cp);
  1928. }
  1929. // Comment end dash state
  1930. //------------------------------------------------------------------
  1931. _stateCommentEndDash(cp) {
  1932. const token = this.currentToken;
  1933. switch (cp) {
  1934. case unicode_js_1.CODE_POINTS.HYPHEN_MINUS: {
  1935. this.state = State.COMMENT_END;
  1936. break;
  1937. }
  1938. case unicode_js_1.CODE_POINTS.EOF: {
  1939. this._err(error_codes_js_1.ERR.eofInComment);
  1940. this.emitCurrentComment(token);
  1941. this._emitEOFToken();
  1942. break;
  1943. }
  1944. default: {
  1945. token.data += '-';
  1946. this.state = State.COMMENT;
  1947. this._stateComment(cp);
  1948. }
  1949. }
  1950. }
  1951. // Comment end state
  1952. //------------------------------------------------------------------
  1953. _stateCommentEnd(cp) {
  1954. const token = this.currentToken;
  1955. switch (cp) {
  1956. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  1957. this.state = State.DATA;
  1958. this.emitCurrentComment(token);
  1959. break;
  1960. }
  1961. case unicode_js_1.CODE_POINTS.EXCLAMATION_MARK: {
  1962. this.state = State.COMMENT_END_BANG;
  1963. break;
  1964. }
  1965. case unicode_js_1.CODE_POINTS.HYPHEN_MINUS: {
  1966. token.data += '-';
  1967. break;
  1968. }
  1969. case unicode_js_1.CODE_POINTS.EOF: {
  1970. this._err(error_codes_js_1.ERR.eofInComment);
  1971. this.emitCurrentComment(token);
  1972. this._emitEOFToken();
  1973. break;
  1974. }
  1975. default: {
  1976. token.data += '--';
  1977. this.state = State.COMMENT;
  1978. this._stateComment(cp);
  1979. }
  1980. }
  1981. }
  1982. // Comment end bang state
  1983. //------------------------------------------------------------------
  1984. _stateCommentEndBang(cp) {
  1985. const token = this.currentToken;
  1986. switch (cp) {
  1987. case unicode_js_1.CODE_POINTS.HYPHEN_MINUS: {
  1988. token.data += '--!';
  1989. this.state = State.COMMENT_END_DASH;
  1990. break;
  1991. }
  1992. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  1993. this._err(error_codes_js_1.ERR.incorrectlyClosedComment);
  1994. this.state = State.DATA;
  1995. this.emitCurrentComment(token);
  1996. break;
  1997. }
  1998. case unicode_js_1.CODE_POINTS.EOF: {
  1999. this._err(error_codes_js_1.ERR.eofInComment);
  2000. this.emitCurrentComment(token);
  2001. this._emitEOFToken();
  2002. break;
  2003. }
  2004. default: {
  2005. token.data += '--!';
  2006. this.state = State.COMMENT;
  2007. this._stateComment(cp);
  2008. }
  2009. }
  2010. }
  2011. // DOCTYPE state
  2012. //------------------------------------------------------------------
  2013. _stateDoctype(cp) {
  2014. switch (cp) {
  2015. case unicode_js_1.CODE_POINTS.SPACE:
  2016. case unicode_js_1.CODE_POINTS.LINE_FEED:
  2017. case unicode_js_1.CODE_POINTS.TABULATION:
  2018. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  2019. this.state = State.BEFORE_DOCTYPE_NAME;
  2020. break;
  2021. }
  2022. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  2023. this.state = State.BEFORE_DOCTYPE_NAME;
  2024. this._stateBeforeDoctypeName(cp);
  2025. break;
  2026. }
  2027. case unicode_js_1.CODE_POINTS.EOF: {
  2028. this._err(error_codes_js_1.ERR.eofInDoctype);
  2029. this._createDoctypeToken(null);
  2030. const token = this.currentToken;
  2031. token.forceQuirks = true;
  2032. this.emitCurrentDoctype(token);
  2033. this._emitEOFToken();
  2034. break;
  2035. }
  2036. default: {
  2037. this._err(error_codes_js_1.ERR.missingWhitespaceBeforeDoctypeName);
  2038. this.state = State.BEFORE_DOCTYPE_NAME;
  2039. this._stateBeforeDoctypeName(cp);
  2040. }
  2041. }
  2042. }
  2043. // Before DOCTYPE name state
  2044. //------------------------------------------------------------------
  2045. _stateBeforeDoctypeName(cp) {
  2046. if (isAsciiUpper(cp)) {
  2047. this._createDoctypeToken(String.fromCharCode(toAsciiLower(cp)));
  2048. this.state = State.DOCTYPE_NAME;
  2049. }
  2050. else
  2051. switch (cp) {
  2052. case unicode_js_1.CODE_POINTS.SPACE:
  2053. case unicode_js_1.CODE_POINTS.LINE_FEED:
  2054. case unicode_js_1.CODE_POINTS.TABULATION:
  2055. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  2056. // Ignore whitespace
  2057. break;
  2058. }
  2059. case unicode_js_1.CODE_POINTS.NULL: {
  2060. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  2061. this._createDoctypeToken(unicode_js_1.REPLACEMENT_CHARACTER);
  2062. this.state = State.DOCTYPE_NAME;
  2063. break;
  2064. }
  2065. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  2066. this._err(error_codes_js_1.ERR.missingDoctypeName);
  2067. this._createDoctypeToken(null);
  2068. const token = this.currentToken;
  2069. token.forceQuirks = true;
  2070. this.emitCurrentDoctype(token);
  2071. this.state = State.DATA;
  2072. break;
  2073. }
  2074. case unicode_js_1.CODE_POINTS.EOF: {
  2075. this._err(error_codes_js_1.ERR.eofInDoctype);
  2076. this._createDoctypeToken(null);
  2077. const token = this.currentToken;
  2078. token.forceQuirks = true;
  2079. this.emitCurrentDoctype(token);
  2080. this._emitEOFToken();
  2081. break;
  2082. }
  2083. default: {
  2084. this._createDoctypeToken(String.fromCodePoint(cp));
  2085. this.state = State.DOCTYPE_NAME;
  2086. }
  2087. }
  2088. }
  2089. // DOCTYPE name state
  2090. //------------------------------------------------------------------
  2091. _stateDoctypeName(cp) {
  2092. const token = this.currentToken;
  2093. switch (cp) {
  2094. case unicode_js_1.CODE_POINTS.SPACE:
  2095. case unicode_js_1.CODE_POINTS.LINE_FEED:
  2096. case unicode_js_1.CODE_POINTS.TABULATION:
  2097. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  2098. this.state = State.AFTER_DOCTYPE_NAME;
  2099. break;
  2100. }
  2101. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  2102. this.state = State.DATA;
  2103. this.emitCurrentDoctype(token);
  2104. break;
  2105. }
  2106. case unicode_js_1.CODE_POINTS.NULL: {
  2107. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  2108. token.name += unicode_js_1.REPLACEMENT_CHARACTER;
  2109. break;
  2110. }
  2111. case unicode_js_1.CODE_POINTS.EOF: {
  2112. this._err(error_codes_js_1.ERR.eofInDoctype);
  2113. token.forceQuirks = true;
  2114. this.emitCurrentDoctype(token);
  2115. this._emitEOFToken();
  2116. break;
  2117. }
  2118. default: {
  2119. token.name += String.fromCodePoint(isAsciiUpper(cp) ? toAsciiLower(cp) : cp);
  2120. }
  2121. }
  2122. }
  2123. // After DOCTYPE name state
  2124. //------------------------------------------------------------------
  2125. _stateAfterDoctypeName(cp) {
  2126. const token = this.currentToken;
  2127. switch (cp) {
  2128. case unicode_js_1.CODE_POINTS.SPACE:
  2129. case unicode_js_1.CODE_POINTS.LINE_FEED:
  2130. case unicode_js_1.CODE_POINTS.TABULATION:
  2131. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  2132. // Ignore whitespace
  2133. break;
  2134. }
  2135. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  2136. this.state = State.DATA;
  2137. this.emitCurrentDoctype(token);
  2138. break;
  2139. }
  2140. case unicode_js_1.CODE_POINTS.EOF: {
  2141. this._err(error_codes_js_1.ERR.eofInDoctype);
  2142. token.forceQuirks = true;
  2143. this.emitCurrentDoctype(token);
  2144. this._emitEOFToken();
  2145. break;
  2146. }
  2147. default: {
  2148. if (this._consumeSequenceIfMatch(unicode_js_1.SEQUENCES.PUBLIC, false)) {
  2149. this.state = State.AFTER_DOCTYPE_PUBLIC_KEYWORD;
  2150. }
  2151. else if (this._consumeSequenceIfMatch(unicode_js_1.SEQUENCES.SYSTEM, false)) {
  2152. this.state = State.AFTER_DOCTYPE_SYSTEM_KEYWORD;
  2153. }
  2154. //NOTE: sequence lookup can be abrupted by hibernation. In that case lookup
  2155. //results are no longer valid and we will need to start over.
  2156. else if (!this._ensureHibernation()) {
  2157. this._err(error_codes_js_1.ERR.invalidCharacterSequenceAfterDoctypeName);
  2158. token.forceQuirks = true;
  2159. this.state = State.BOGUS_DOCTYPE;
  2160. this._stateBogusDoctype(cp);
  2161. }
  2162. }
  2163. }
  2164. }
  2165. // After DOCTYPE public keyword state
  2166. //------------------------------------------------------------------
  2167. _stateAfterDoctypePublicKeyword(cp) {
  2168. const token = this.currentToken;
  2169. switch (cp) {
  2170. case unicode_js_1.CODE_POINTS.SPACE:
  2171. case unicode_js_1.CODE_POINTS.LINE_FEED:
  2172. case unicode_js_1.CODE_POINTS.TABULATION:
  2173. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  2174. this.state = State.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER;
  2175. break;
  2176. }
  2177. case unicode_js_1.CODE_POINTS.QUOTATION_MARK: {
  2178. this._err(error_codes_js_1.ERR.missingWhitespaceAfterDoctypePublicKeyword);
  2179. token.publicId = '';
  2180. this.state = State.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED;
  2181. break;
  2182. }
  2183. case unicode_js_1.CODE_POINTS.APOSTROPHE: {
  2184. this._err(error_codes_js_1.ERR.missingWhitespaceAfterDoctypePublicKeyword);
  2185. token.publicId = '';
  2186. this.state = State.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED;
  2187. break;
  2188. }
  2189. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  2190. this._err(error_codes_js_1.ERR.missingDoctypePublicIdentifier);
  2191. token.forceQuirks = true;
  2192. this.state = State.DATA;
  2193. this.emitCurrentDoctype(token);
  2194. break;
  2195. }
  2196. case unicode_js_1.CODE_POINTS.EOF: {
  2197. this._err(error_codes_js_1.ERR.eofInDoctype);
  2198. token.forceQuirks = true;
  2199. this.emitCurrentDoctype(token);
  2200. this._emitEOFToken();
  2201. break;
  2202. }
  2203. default: {
  2204. this._err(error_codes_js_1.ERR.missingQuoteBeforeDoctypePublicIdentifier);
  2205. token.forceQuirks = true;
  2206. this.state = State.BOGUS_DOCTYPE;
  2207. this._stateBogusDoctype(cp);
  2208. }
  2209. }
  2210. }
  2211. // Before DOCTYPE public identifier state
  2212. //------------------------------------------------------------------
  2213. _stateBeforeDoctypePublicIdentifier(cp) {
  2214. const token = this.currentToken;
  2215. switch (cp) {
  2216. case unicode_js_1.CODE_POINTS.SPACE:
  2217. case unicode_js_1.CODE_POINTS.LINE_FEED:
  2218. case unicode_js_1.CODE_POINTS.TABULATION:
  2219. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  2220. // Ignore whitespace
  2221. break;
  2222. }
  2223. case unicode_js_1.CODE_POINTS.QUOTATION_MARK: {
  2224. token.publicId = '';
  2225. this.state = State.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED;
  2226. break;
  2227. }
  2228. case unicode_js_1.CODE_POINTS.APOSTROPHE: {
  2229. token.publicId = '';
  2230. this.state = State.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED;
  2231. break;
  2232. }
  2233. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  2234. this._err(error_codes_js_1.ERR.missingDoctypePublicIdentifier);
  2235. token.forceQuirks = true;
  2236. this.state = State.DATA;
  2237. this.emitCurrentDoctype(token);
  2238. break;
  2239. }
  2240. case unicode_js_1.CODE_POINTS.EOF: {
  2241. this._err(error_codes_js_1.ERR.eofInDoctype);
  2242. token.forceQuirks = true;
  2243. this.emitCurrentDoctype(token);
  2244. this._emitEOFToken();
  2245. break;
  2246. }
  2247. default: {
  2248. this._err(error_codes_js_1.ERR.missingQuoteBeforeDoctypePublicIdentifier);
  2249. token.forceQuirks = true;
  2250. this.state = State.BOGUS_DOCTYPE;
  2251. this._stateBogusDoctype(cp);
  2252. }
  2253. }
  2254. }
  2255. // DOCTYPE public identifier (double-quoted) state
  2256. //------------------------------------------------------------------
  2257. _stateDoctypePublicIdentifierDoubleQuoted(cp) {
  2258. const token = this.currentToken;
  2259. switch (cp) {
  2260. case unicode_js_1.CODE_POINTS.QUOTATION_MARK: {
  2261. this.state = State.AFTER_DOCTYPE_PUBLIC_IDENTIFIER;
  2262. break;
  2263. }
  2264. case unicode_js_1.CODE_POINTS.NULL: {
  2265. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  2266. token.publicId += unicode_js_1.REPLACEMENT_CHARACTER;
  2267. break;
  2268. }
  2269. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  2270. this._err(error_codes_js_1.ERR.abruptDoctypePublicIdentifier);
  2271. token.forceQuirks = true;
  2272. this.emitCurrentDoctype(token);
  2273. this.state = State.DATA;
  2274. break;
  2275. }
  2276. case unicode_js_1.CODE_POINTS.EOF: {
  2277. this._err(error_codes_js_1.ERR.eofInDoctype);
  2278. token.forceQuirks = true;
  2279. this.emitCurrentDoctype(token);
  2280. this._emitEOFToken();
  2281. break;
  2282. }
  2283. default: {
  2284. token.publicId += String.fromCodePoint(cp);
  2285. }
  2286. }
  2287. }
  2288. // DOCTYPE public identifier (single-quoted) state
  2289. //------------------------------------------------------------------
  2290. _stateDoctypePublicIdentifierSingleQuoted(cp) {
  2291. const token = this.currentToken;
  2292. switch (cp) {
  2293. case unicode_js_1.CODE_POINTS.APOSTROPHE: {
  2294. this.state = State.AFTER_DOCTYPE_PUBLIC_IDENTIFIER;
  2295. break;
  2296. }
  2297. case unicode_js_1.CODE_POINTS.NULL: {
  2298. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  2299. token.publicId += unicode_js_1.REPLACEMENT_CHARACTER;
  2300. break;
  2301. }
  2302. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  2303. this._err(error_codes_js_1.ERR.abruptDoctypePublicIdentifier);
  2304. token.forceQuirks = true;
  2305. this.emitCurrentDoctype(token);
  2306. this.state = State.DATA;
  2307. break;
  2308. }
  2309. case unicode_js_1.CODE_POINTS.EOF: {
  2310. this._err(error_codes_js_1.ERR.eofInDoctype);
  2311. token.forceQuirks = true;
  2312. this.emitCurrentDoctype(token);
  2313. this._emitEOFToken();
  2314. break;
  2315. }
  2316. default: {
  2317. token.publicId += String.fromCodePoint(cp);
  2318. }
  2319. }
  2320. }
  2321. // After DOCTYPE public identifier state
  2322. //------------------------------------------------------------------
  2323. _stateAfterDoctypePublicIdentifier(cp) {
  2324. const token = this.currentToken;
  2325. switch (cp) {
  2326. case unicode_js_1.CODE_POINTS.SPACE:
  2327. case unicode_js_1.CODE_POINTS.LINE_FEED:
  2328. case unicode_js_1.CODE_POINTS.TABULATION:
  2329. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  2330. this.state = State.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS;
  2331. break;
  2332. }
  2333. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  2334. this.state = State.DATA;
  2335. this.emitCurrentDoctype(token);
  2336. break;
  2337. }
  2338. case unicode_js_1.CODE_POINTS.QUOTATION_MARK: {
  2339. this._err(error_codes_js_1.ERR.missingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
  2340. token.systemId = '';
  2341. this.state = State.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
  2342. break;
  2343. }
  2344. case unicode_js_1.CODE_POINTS.APOSTROPHE: {
  2345. this._err(error_codes_js_1.ERR.missingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
  2346. token.systemId = '';
  2347. this.state = State.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED;
  2348. break;
  2349. }
  2350. case unicode_js_1.CODE_POINTS.EOF: {
  2351. this._err(error_codes_js_1.ERR.eofInDoctype);
  2352. token.forceQuirks = true;
  2353. this.emitCurrentDoctype(token);
  2354. this._emitEOFToken();
  2355. break;
  2356. }
  2357. default: {
  2358. this._err(error_codes_js_1.ERR.missingQuoteBeforeDoctypeSystemIdentifier);
  2359. token.forceQuirks = true;
  2360. this.state = State.BOGUS_DOCTYPE;
  2361. this._stateBogusDoctype(cp);
  2362. }
  2363. }
  2364. }
  2365. // Between DOCTYPE public and system identifiers state
  2366. //------------------------------------------------------------------
  2367. _stateBetweenDoctypePublicAndSystemIdentifiers(cp) {
  2368. const token = this.currentToken;
  2369. switch (cp) {
  2370. case unicode_js_1.CODE_POINTS.SPACE:
  2371. case unicode_js_1.CODE_POINTS.LINE_FEED:
  2372. case unicode_js_1.CODE_POINTS.TABULATION:
  2373. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  2374. // Ignore whitespace
  2375. break;
  2376. }
  2377. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  2378. this.emitCurrentDoctype(token);
  2379. this.state = State.DATA;
  2380. break;
  2381. }
  2382. case unicode_js_1.CODE_POINTS.QUOTATION_MARK: {
  2383. token.systemId = '';
  2384. this.state = State.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
  2385. break;
  2386. }
  2387. case unicode_js_1.CODE_POINTS.APOSTROPHE: {
  2388. token.systemId = '';
  2389. this.state = State.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED;
  2390. break;
  2391. }
  2392. case unicode_js_1.CODE_POINTS.EOF: {
  2393. this._err(error_codes_js_1.ERR.eofInDoctype);
  2394. token.forceQuirks = true;
  2395. this.emitCurrentDoctype(token);
  2396. this._emitEOFToken();
  2397. break;
  2398. }
  2399. default: {
  2400. this._err(error_codes_js_1.ERR.missingQuoteBeforeDoctypeSystemIdentifier);
  2401. token.forceQuirks = true;
  2402. this.state = State.BOGUS_DOCTYPE;
  2403. this._stateBogusDoctype(cp);
  2404. }
  2405. }
  2406. }
  2407. // After DOCTYPE system keyword state
  2408. //------------------------------------------------------------------
  2409. _stateAfterDoctypeSystemKeyword(cp) {
  2410. const token = this.currentToken;
  2411. switch (cp) {
  2412. case unicode_js_1.CODE_POINTS.SPACE:
  2413. case unicode_js_1.CODE_POINTS.LINE_FEED:
  2414. case unicode_js_1.CODE_POINTS.TABULATION:
  2415. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  2416. this.state = State.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
  2417. break;
  2418. }
  2419. case unicode_js_1.CODE_POINTS.QUOTATION_MARK: {
  2420. this._err(error_codes_js_1.ERR.missingWhitespaceAfterDoctypeSystemKeyword);
  2421. token.systemId = '';
  2422. this.state = State.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
  2423. break;
  2424. }
  2425. case unicode_js_1.CODE_POINTS.APOSTROPHE: {
  2426. this._err(error_codes_js_1.ERR.missingWhitespaceAfterDoctypeSystemKeyword);
  2427. token.systemId = '';
  2428. this.state = State.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED;
  2429. break;
  2430. }
  2431. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  2432. this._err(error_codes_js_1.ERR.missingDoctypeSystemIdentifier);
  2433. token.forceQuirks = true;
  2434. this.state = State.DATA;
  2435. this.emitCurrentDoctype(token);
  2436. break;
  2437. }
  2438. case unicode_js_1.CODE_POINTS.EOF: {
  2439. this._err(error_codes_js_1.ERR.eofInDoctype);
  2440. token.forceQuirks = true;
  2441. this.emitCurrentDoctype(token);
  2442. this._emitEOFToken();
  2443. break;
  2444. }
  2445. default: {
  2446. this._err(error_codes_js_1.ERR.missingQuoteBeforeDoctypeSystemIdentifier);
  2447. token.forceQuirks = true;
  2448. this.state = State.BOGUS_DOCTYPE;
  2449. this._stateBogusDoctype(cp);
  2450. }
  2451. }
  2452. }
  2453. // Before DOCTYPE system identifier state
  2454. //------------------------------------------------------------------
  2455. _stateBeforeDoctypeSystemIdentifier(cp) {
  2456. const token = this.currentToken;
  2457. switch (cp) {
  2458. case unicode_js_1.CODE_POINTS.SPACE:
  2459. case unicode_js_1.CODE_POINTS.LINE_FEED:
  2460. case unicode_js_1.CODE_POINTS.TABULATION:
  2461. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  2462. // Ignore whitespace
  2463. break;
  2464. }
  2465. case unicode_js_1.CODE_POINTS.QUOTATION_MARK: {
  2466. token.systemId = '';
  2467. this.state = State.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
  2468. break;
  2469. }
  2470. case unicode_js_1.CODE_POINTS.APOSTROPHE: {
  2471. token.systemId = '';
  2472. this.state = State.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED;
  2473. break;
  2474. }
  2475. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  2476. this._err(error_codes_js_1.ERR.missingDoctypeSystemIdentifier);
  2477. token.forceQuirks = true;
  2478. this.state = State.DATA;
  2479. this.emitCurrentDoctype(token);
  2480. break;
  2481. }
  2482. case unicode_js_1.CODE_POINTS.EOF: {
  2483. this._err(error_codes_js_1.ERR.eofInDoctype);
  2484. token.forceQuirks = true;
  2485. this.emitCurrentDoctype(token);
  2486. this._emitEOFToken();
  2487. break;
  2488. }
  2489. default: {
  2490. this._err(error_codes_js_1.ERR.missingQuoteBeforeDoctypeSystemIdentifier);
  2491. token.forceQuirks = true;
  2492. this.state = State.BOGUS_DOCTYPE;
  2493. this._stateBogusDoctype(cp);
  2494. }
  2495. }
  2496. }
  2497. // DOCTYPE system identifier (double-quoted) state
  2498. //------------------------------------------------------------------
  2499. _stateDoctypeSystemIdentifierDoubleQuoted(cp) {
  2500. const token = this.currentToken;
  2501. switch (cp) {
  2502. case unicode_js_1.CODE_POINTS.QUOTATION_MARK: {
  2503. this.state = State.AFTER_DOCTYPE_SYSTEM_IDENTIFIER;
  2504. break;
  2505. }
  2506. case unicode_js_1.CODE_POINTS.NULL: {
  2507. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  2508. token.systemId += unicode_js_1.REPLACEMENT_CHARACTER;
  2509. break;
  2510. }
  2511. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  2512. this._err(error_codes_js_1.ERR.abruptDoctypeSystemIdentifier);
  2513. token.forceQuirks = true;
  2514. this.emitCurrentDoctype(token);
  2515. this.state = State.DATA;
  2516. break;
  2517. }
  2518. case unicode_js_1.CODE_POINTS.EOF: {
  2519. this._err(error_codes_js_1.ERR.eofInDoctype);
  2520. token.forceQuirks = true;
  2521. this.emitCurrentDoctype(token);
  2522. this._emitEOFToken();
  2523. break;
  2524. }
  2525. default: {
  2526. token.systemId += String.fromCodePoint(cp);
  2527. }
  2528. }
  2529. }
  2530. // DOCTYPE system identifier (single-quoted) state
  2531. //------------------------------------------------------------------
  2532. _stateDoctypeSystemIdentifierSingleQuoted(cp) {
  2533. const token = this.currentToken;
  2534. switch (cp) {
  2535. case unicode_js_1.CODE_POINTS.APOSTROPHE: {
  2536. this.state = State.AFTER_DOCTYPE_SYSTEM_IDENTIFIER;
  2537. break;
  2538. }
  2539. case unicode_js_1.CODE_POINTS.NULL: {
  2540. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  2541. token.systemId += unicode_js_1.REPLACEMENT_CHARACTER;
  2542. break;
  2543. }
  2544. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  2545. this._err(error_codes_js_1.ERR.abruptDoctypeSystemIdentifier);
  2546. token.forceQuirks = true;
  2547. this.emitCurrentDoctype(token);
  2548. this.state = State.DATA;
  2549. break;
  2550. }
  2551. case unicode_js_1.CODE_POINTS.EOF: {
  2552. this._err(error_codes_js_1.ERR.eofInDoctype);
  2553. token.forceQuirks = true;
  2554. this.emitCurrentDoctype(token);
  2555. this._emitEOFToken();
  2556. break;
  2557. }
  2558. default: {
  2559. token.systemId += String.fromCodePoint(cp);
  2560. }
  2561. }
  2562. }
  2563. // After DOCTYPE system identifier state
  2564. //------------------------------------------------------------------
  2565. _stateAfterDoctypeSystemIdentifier(cp) {
  2566. const token = this.currentToken;
  2567. switch (cp) {
  2568. case unicode_js_1.CODE_POINTS.SPACE:
  2569. case unicode_js_1.CODE_POINTS.LINE_FEED:
  2570. case unicode_js_1.CODE_POINTS.TABULATION:
  2571. case unicode_js_1.CODE_POINTS.FORM_FEED: {
  2572. // Ignore whitespace
  2573. break;
  2574. }
  2575. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  2576. this.emitCurrentDoctype(token);
  2577. this.state = State.DATA;
  2578. break;
  2579. }
  2580. case unicode_js_1.CODE_POINTS.EOF: {
  2581. this._err(error_codes_js_1.ERR.eofInDoctype);
  2582. token.forceQuirks = true;
  2583. this.emitCurrentDoctype(token);
  2584. this._emitEOFToken();
  2585. break;
  2586. }
  2587. default: {
  2588. this._err(error_codes_js_1.ERR.unexpectedCharacterAfterDoctypeSystemIdentifier);
  2589. this.state = State.BOGUS_DOCTYPE;
  2590. this._stateBogusDoctype(cp);
  2591. }
  2592. }
  2593. }
  2594. // Bogus DOCTYPE state
  2595. //------------------------------------------------------------------
  2596. _stateBogusDoctype(cp) {
  2597. const token = this.currentToken;
  2598. switch (cp) {
  2599. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  2600. this.emitCurrentDoctype(token);
  2601. this.state = State.DATA;
  2602. break;
  2603. }
  2604. case unicode_js_1.CODE_POINTS.NULL: {
  2605. this._err(error_codes_js_1.ERR.unexpectedNullCharacter);
  2606. break;
  2607. }
  2608. case unicode_js_1.CODE_POINTS.EOF: {
  2609. this.emitCurrentDoctype(token);
  2610. this._emitEOFToken();
  2611. break;
  2612. }
  2613. default:
  2614. // Do nothing
  2615. }
  2616. }
  2617. // CDATA section state
  2618. //------------------------------------------------------------------
  2619. _stateCdataSection(cp) {
  2620. switch (cp) {
  2621. case unicode_js_1.CODE_POINTS.RIGHT_SQUARE_BRACKET: {
  2622. this.state = State.CDATA_SECTION_BRACKET;
  2623. break;
  2624. }
  2625. case unicode_js_1.CODE_POINTS.EOF: {
  2626. this._err(error_codes_js_1.ERR.eofInCdata);
  2627. this._emitEOFToken();
  2628. break;
  2629. }
  2630. default: {
  2631. this._emitCodePoint(cp);
  2632. }
  2633. }
  2634. }
  2635. // CDATA section bracket state
  2636. //------------------------------------------------------------------
  2637. _stateCdataSectionBracket(cp) {
  2638. if (cp === unicode_js_1.CODE_POINTS.RIGHT_SQUARE_BRACKET) {
  2639. this.state = State.CDATA_SECTION_END;
  2640. }
  2641. else {
  2642. this._emitChars(']');
  2643. this.state = State.CDATA_SECTION;
  2644. this._stateCdataSection(cp);
  2645. }
  2646. }
  2647. // CDATA section end state
  2648. //------------------------------------------------------------------
  2649. _stateCdataSectionEnd(cp) {
  2650. switch (cp) {
  2651. case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: {
  2652. this.state = State.DATA;
  2653. break;
  2654. }
  2655. case unicode_js_1.CODE_POINTS.RIGHT_SQUARE_BRACKET: {
  2656. this._emitChars(']');
  2657. break;
  2658. }
  2659. default: {
  2660. this._emitChars(']]');
  2661. this.state = State.CDATA_SECTION;
  2662. this._stateCdataSection(cp);
  2663. }
  2664. }
  2665. }
  2666. // Character reference state
  2667. //------------------------------------------------------------------
  2668. _stateCharacterReference() {
  2669. let length = this.entityDecoder.write(this.preprocessor.html, this.preprocessor.pos);
  2670. if (length < 0) {
  2671. if (this.preprocessor.lastChunkWritten) {
  2672. length = this.entityDecoder.end();
  2673. }
  2674. else {
  2675. // Wait for the rest of the entity.
  2676. this.active = false;
  2677. // Mark the entire buffer as read.
  2678. this.preprocessor.pos = this.preprocessor.html.length - 1;
  2679. this.consumedAfterSnapshot = 0;
  2680. this.preprocessor.endOfChunkHit = true;
  2681. return;
  2682. }
  2683. }
  2684. if (length === 0) {
  2685. // This was not a valid entity. Go back to the beginning, and
  2686. // figure out what to do.
  2687. this.preprocessor.pos = this.entityStartPos;
  2688. this._flushCodePointConsumedAsCharacterReference(unicode_js_1.CODE_POINTS.AMPERSAND);
  2689. this.state =
  2690. !this._isCharacterReferenceInAttribute() && isAsciiAlphaNumeric(this.preprocessor.peek(1))
  2691. ? State.AMBIGUOUS_AMPERSAND
  2692. : this.returnState;
  2693. }
  2694. else {
  2695. // We successfully parsed an entity. Switch to the return state.
  2696. this.state = this.returnState;
  2697. }
  2698. }
  2699. // Ambiguous ampersand state
  2700. //------------------------------------------------------------------
  2701. _stateAmbiguousAmpersand(cp) {
  2702. if (isAsciiAlphaNumeric(cp)) {
  2703. this._flushCodePointConsumedAsCharacterReference(cp);
  2704. }
  2705. else {
  2706. if (cp === unicode_js_1.CODE_POINTS.SEMICOLON) {
  2707. this._err(error_codes_js_1.ERR.unknownNamedCharacterReference);
  2708. }
  2709. this.state = this.returnState;
  2710. this._callState(cp);
  2711. }
  2712. }
  2713. }
  2714. exports.Tokenizer = Tokenizer;