preprocessor.js 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. "use strict";
  2. Object.defineProperty(exports, "__esModule", { value: true });
  3. exports.Preprocessor = void 0;
  4. const unicode_js_1 = require("../common/unicode.js");
  5. const error_codes_js_1 = require("../common/error-codes.js");
  /**
   * Default value for `Preprocessor#bufferWaterline`: once the read position
   * moves past this many UTF-16 code units (2^16 = 65536), the already-consumed
   * part of the buffer becomes eligible to be dropped.
   */
  const DEFAULT_BUFFER_WATERLINE = 2 ** 16;
  /**
   * HTML input-stream preprocessor.
   *
   * Buffers the HTML written so far and hands it out one code point at a time:
   * CR and CRLF are collapsed into a single LF, UTF-16 surrogate pairs are
   * combined into one code point, and line / column / offset are tracked so
   * that input-stream errors can be reported through `handler.onParseError`.
   *
   * (see: https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream)
   */
  class Preprocessor {
      /**
       * @param handler Object that may carry an `onParseError(error)` callback.
       *     When the callback is absent (null/undefined) no errors are reported.
       */
      constructor(handler) {
          this.handler = handler;
          this.html = '';
          // Index (into `html`) of the code unit returned by the last `advance()`.
          this.pos = -1;
          // NOTE: Initial `lastGapPos` is -2, to ensure `col` on initialisation is 0
          this.lastGapPos = -2;
          // Previous values of `lastGapPos`, restored by `retreat()`.
          this.gapStack = [];
          this.skipNextNewLine = false;
          this.lastChunkWritten = false;
          this.endOfChunkHit = false;
          this.bufferWaterline = DEFAULT_BUFFER_WATERLINE;
          // Set when the code point just returned ended a line; the line counter
          // is bumped on the *next* `advance()`.
          this.isEol = false;
          this.lineStartPos = 0;
          // Number of code units already discarded by `dropParsedChunk()`.
          this.droppedBufferSize = 0;
          this.line = 1;
          //NOTE: avoid reporting errors twice on advance/retreat
          this.lastErrOffset = -1;
      }
      /** The column on the current line. If we just saw a gap (eg. a surrogate pair), return the index before. */
      get col() {
          return this.pos - this.lineStartPos + Number(this.lastGapPos !== this.pos);
      }
      /** Position in the whole input: current buffer position plus everything dropped so far. */
      get offset() {
          return this.droppedBufferSize + this.pos;
      }
      /**
       * Builds a zero-width error record located relative to the current position.
       *
       * @param code Error code to report.
       * @param cpOffset Value added to both the current column and the current offset.
       * @returns Object with `code`, `startLine`/`endLine` (current line) and
       *     identical start/end columns and offsets.
       */
      getError(code, cpOffset) {
          const { line, col, offset } = this;
          const startCol = col + cpOffset;
          const startOffset = offset + cpOffset;
          return {
              code,
              startLine: line,
              endLine: line,
              startCol,
              endCol: startCol,
              startOffset,
              endOffset: startOffset,
          };
      }
      /**
       * Reports `code` at the current position through `handler.onParseError`,
       * unless there is no callback or an error was already reported at this offset.
       */
      _err(code) {
          if (this.handler.onParseError && this.lastErrOffset !== this.offset) {
              this.lastErrOffset = this.offset;
              this.handler.onParseError(this.getError(code, 0));
          }
      }
      /** Records the current position as a gap, remembering the previous gap for `retreat()`. */
      _addGap() {
          this.gapStack.push(this.lastGapPos);
          this.lastGapPos = this.pos;
      }
      /**
       * Handles a surrogate code unit found at the current position.
       *
       * @param cp The surrogate code unit at `pos`.
       * @returns The combined code point when the next unit completes a pair, EOF
       *     when the buffer ends here and more chunks may follow, otherwise `cp`
       *     itself (after reporting `surrogateInInputStream`).
       */
      _processSurrogate(cp) {
          //NOTE: try to peek a surrogate pair
          if (this.pos !== this.html.length - 1) {
              const nextCp = this.html.charCodeAt(this.pos + 1);
              if ((0, unicode_js_1.isSurrogatePair)(nextCp)) {
                  //NOTE: we have a surrogate pair. Peek pair character and recalculate code point.
                  this.pos++;
                  //NOTE: add a gap that should be avoided during retreat
                  this._addGap();
                  return (0, unicode_js_1.getSurrogatePairCodePoint)(cp, nextCp);
              }
          }
          //NOTE: we are at the end of a chunk, therefore we can't infer the surrogate pair yet.
          else if (!this.lastChunkWritten) {
              this.endOfChunkHit = true;
              return unicode_js_1.CODE_POINTS.EOF;
          }
          //NOTE: isolated surrogate
          this._err(error_codes_js_1.ERR.surrogateInInputStream);
          return cp;
      }
      /** @returns `true` when the read position has moved past `bufferWaterline`. */
      willDropParsedChunk() {
          return this.pos > this.bufferWaterline;
      }
      /**
       * Discards everything before the current position once the waterline is
       * exceeded. `lineStartPos` and `droppedBufferSize` are shifted by the same
       * amount so `col` and `offset` keep their values; gap history is reset.
       */
      dropParsedChunk() {
          if (this.willDropParsedChunk()) {
              this.html = this.html.substring(this.pos);
              this.lineStartPos -= this.pos;
              this.droppedBufferSize += this.pos;
              this.pos = 0;
              this.lastGapPos = -2;
              this.gapStack.length = 0;
          }
      }
      /**
       * Appends a chunk of input to the buffer and clears `endOfChunkHit`.
       *
       * @param chunk Text to append.
       * @param isLastChunk Whether no further chunks will be written.
       */
      write(chunk, isLastChunk) {
          if (this.html.length > 0) {
              this.html += chunk;
          }
          else {
              this.html = chunk;
          }
          this.endOfChunkHit = false;
          this.lastChunkWritten = isLastChunk;
      }
      /**
       * Splices `chunk` into the buffer immediately after the current position,
       * so it is the next thing `advance()` reads, and clears `endOfChunkHit`.
       */
      insertHtmlAtCurrentPos(chunk) {
          this.html = this.html.substring(0, this.pos + 1) + chunk + this.html.substring(this.pos + 1);
          this.endOfChunkHit = false;
      }
      /**
       * Tests whether the buffer, starting at the current position, begins with `pattern`.
       *
       * @param pattern Text to look for. In case-insensitive mode every input code
       *     unit is OR-ed with 0x20 before comparison, so `pattern` has to be given
       *     in lowercase for letters to match.
       * @param caseSensitive Compare exactly when truthy.
       * @returns `false` when the buffer is too short (also flagging
       *     `endOfChunkHit` if more chunks may follow) or the text differs.
       */
      startsWith(pattern, caseSensitive) {
          // Check if our buffer has enough characters
          if (this.pos + pattern.length > this.html.length) {
              this.endOfChunkHit = !this.lastChunkWritten;
              return false;
          }
          if (caseSensitive) {
              return this.html.startsWith(pattern, this.pos);
          }
          for (let i = 0; i < pattern.length; i++) {
              const cp = this.html.charCodeAt(this.pos + i) | 0x20;
              if (cp !== pattern.charCodeAt(i)) {
                  return false;
              }
          }
          return true;
      }
      /**
       * Looks at the code unit `offset` positions away from the current one
       * without consuming anything.
       *
       * @returns The code unit (CR is reported as LF), or EOF past the end of the
       *     buffer, in which case `endOfChunkHit` is set if more chunks may follow.
       */
      peek(offset) {
          const pos = this.pos + offset;
          if (pos >= this.html.length) {
              this.endOfChunkHit = !this.lastChunkWritten;
              return unicode_js_1.CODE_POINTS.EOF;
          }
          const code = this.html.charCodeAt(pos);
          return code === unicode_js_1.CODE_POINTS.CARRIAGE_RETURN ? unicode_js_1.CODE_POINTS.LINE_FEED : code;
      }
      /**
       * Consumes and returns the next code point.
       *
       * @returns The code point, LF for a CR or CRLF sequence, or EOF when the
       *     buffer is exhausted (setting `endOfChunkHit` if more chunks may follow).
       */
      advance() {
          this.pos++;
          //NOTE: LF should be in the last column of the line
          if (this.isEol) {
              this.isEol = false;
              this.line++;
              this.lineStartPos = this.pos;
          }
          if (this.pos >= this.html.length) {
              this.endOfChunkHit = !this.lastChunkWritten;
              return unicode_js_1.CODE_POINTS.EOF;
          }
          let cp = this.html.charCodeAt(this.pos);
          //NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters
          if (cp === unicode_js_1.CODE_POINTS.CARRIAGE_RETURN) {
              this.isEol = true;
              this.skipNextNewLine = true;
              return unicode_js_1.CODE_POINTS.LINE_FEED;
          }
          //NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character
          //must be ignored.
          if (cp === unicode_js_1.CODE_POINTS.LINE_FEED) {
              this.isEol = true;
              if (this.skipNextNewLine) {
                  // `line` will be bumped again in the recursive call.
                  this.line--;
                  this.skipNextNewLine = false;
                  this._addGap();
                  return this.advance();
              }
          }
          this.skipNextNewLine = false;
          if ((0, unicode_js_1.isSurrogate)(cp)) {
              cp = this._processSurrogate(cp);
          }
          //OPTIMIZATION: first check if code point is in the common allowed
          //range (ASCII alphanumeric, whitespaces, big chunk of BMP)
          //before going into detailed performance cost validation.
          //NOTE: validation is skipped whenever there is no error callback at all
          //(null *or* undefined), matching the guard used by `_err`.
          const isCommonValidRange = !this.handler.onParseError ||
              (cp > 0x1f && cp < 0x7f) ||
              cp === unicode_js_1.CODE_POINTS.LINE_FEED ||
              cp === unicode_js_1.CODE_POINTS.CARRIAGE_RETURN ||
              (cp > 0x9f && cp < 64976);
          if (!isCommonValidRange) {
              this._checkForProblematicCharacters(cp);
          }
          return cp;
      }
      /** Reports a control character or a noncharacter found in the input stream. */
      _checkForProblematicCharacters(cp) {
          if ((0, unicode_js_1.isControlCodePoint)(cp)) {
              this._err(error_codes_js_1.ERR.controlCharacterInInputStream);
          }
          else if ((0, unicode_js_1.isUndefinedCodePoint)(cp)) {
              this._err(error_codes_js_1.ERR.noncharacterInInputStream);
          }
      }
      /**
       * Moves the read position back by `count` code points, taking one extra
       * step back for every recorded gap that is crossed, and clears `isEol`.
       */
      retreat(count) {
          this.pos -= count;
          while (this.pos < this.lastGapPos) {
              this.lastGapPos = this.gapStack.pop();
              this.pos--;
          }
          this.isEol = false;
      }
  }
  200. exports.Preprocessor = Preprocessor;