// preprocessor.js
  import {
      CODE_POINTS as $,
      getSurrogatePairCodePoint,
      isControlCodePoint,
      isSurrogate,
      isSurrogatePair,
      isUndefinedCodePoint,
  } from '../common/unicode.js';
  import { ERR } from '../common/error-codes.js';
  // Default value of `Preprocessor#bufferWaterline` (65536): once the read
  // position is past this index, `dropParsedChunk()` discards the consumed
  // prefix of the buffer.
  const DEFAULT_BUFFER_WATERLINE = 1 << 16;
  /**
   * HTML input preprocessing.
   *
   * Hands out the buffered input one code point at a time: CR and CRLF are
   * normalised to a single LF, surrogate pairs are combined into one code
   * point, and problematic characters are reported through
   * `handler.onParseError` when that callback is present. Line, column and
   * absolute offset are tracked for error locations.
   *
   * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
   */
  export class Preprocessor {
      /**
       * @param {object} handler Object whose optional `onParseError(err)`
       *   callback receives the location objects built by `getError()`.
       */
      constructor(handler) {
          this.handler = handler;
          /** Buffered input that has not been dropped yet. */
          this.html = '';
          /** Index into `html` of the most recently consumed code unit. */
          this.pos = -1;
          // NOTE: Initial `lastGapPos` is -2, to ensure `col` on initialisation is 0
          this.lastGapPos = -2;
          /** Earlier `lastGapPos` values, restored one by one in `retreat()`. */
          this.gapStack = [];
          /** Set after a CR so that an immediately following LF is swallowed. */
          this.skipNextNewLine = false;
          this.lastChunkWritten = false;
          /** True when a read ran past the buffer while more input may still arrive. */
          this.endOfChunkHit = false;
          this.bufferWaterline = DEFAULT_BUFFER_WATERLINE;
          /** True when the code point just returned ended a line. */
          this.isEol = false;
          this.lineStartPos = 0;
          /** Number of code units removed so far by `dropParsedChunk()`. */
          this.droppedBufferSize = 0;
          this.line = 1;
          // NOTE: avoid reporting errors twice on advance/retreat
          this.lastErrOffset = -1;
      }

      /** The column on the current line. If we just saw a gap (eg. a surrogate pair), return the index before. */
      get col() {
          return this.pos - this.lineStartPos + Number(this.lastGapPos !== this.pos);
      }

      /** Position in the whole input: dropped code units plus the position in the buffer. */
      get offset() {
          return this.droppedBufferSize + this.pos;
      }

      /**
       * Builds a zero-width error location relative to the current position.
       *
       * @param {*} code Error code stored unchanged on the result.
       * @param {number} cpOffset Amount added to the current column and offset.
       * @returns {{code: *, startLine: number, endLine: number, startCol: number,
       *   endCol: number, startOffset: number, endOffset: number}}
       */
      getError(code, cpOffset) {
          const { line, col, offset } = this;
          const startCol = col + cpOffset;
          const startOffset = offset + cpOffset;
          return {
              code,
              startLine: line,
              endLine: line,
              startCol,
              endCol: startCol,
              startOffset,
              endOffset: startOffset,
          };
      }

      /**
       * Reports `code` at the current position, at most once per offset, so a
       * retreat followed by a re-advance does not report the same error again.
       */
      _err(code) {
          if (this.handler.onParseError && this.lastErrOffset !== this.offset) {
              this.lastErrOffset = this.offset;
              this.handler.onParseError(this.getError(code, 0));
          }
      }

      /** Marks the current position as a gap and remembers the previous gap. */
      _addGap() {
          this.gapStack.push(this.lastGapPos);
          this.lastGapPos = this.pos;
      }

      /**
       * Handles a surrogate code unit found at the current position.
       *
       * @param {number} cp The surrogate code unit at `pos`.
       * @returns {number} The combined code point when a pair follows, `$.EOF`
       *   when the rest of the pair may arrive in a later chunk, otherwise `cp`
       *   itself (after reporting an isolated surrogate).
       */
      _processSurrogate(cp) {
          //NOTE: try to peek a surrogate pair
          if (this.pos !== this.html.length - 1) {
              const nextCp = this.html.charCodeAt(this.pos + 1);
              if (isSurrogatePair(nextCp)) {
                  //NOTE: we have a surrogate pair. Peek pair character and recalculate code point.
                  this.pos++;
                  //NOTE: add a gap that should be avoided during retreat
                  this._addGap();
                  return getSurrogatePairCodePoint(cp, nextCp);
              }
          }
          //NOTE: we are at the end of a chunk, therefore we can't infer the surrogate pair yet.
          else if (!this.lastChunkWritten) {
              this.endOfChunkHit = true;
              return $.EOF;
          }
          //NOTE: isolated surrogate
          this._err(ERR.surrogateInInputStream);
          return cp;
      }

      /** @returns {boolean} True when the read position is past `bufferWaterline`. */
      willDropParsedChunk() {
          return this.pos > this.bufferWaterline;
      }

      /**
       * Discards everything before the current position once the waterline has
       * been passed. `offset` and `col` stay stable because `droppedBufferSize`
       * and `lineStartPos` are shifted by the same amount; gap history is reset.
       */
      dropParsedChunk() {
          if (this.willDropParsedChunk()) {
              this.html = this.html.substring(this.pos);
              this.lineStartPos -= this.pos;
              this.droppedBufferSize += this.pos;
              this.pos = 0;
              this.lastGapPos = -2;
              this.gapStack.length = 0;
          }
      }

      /**
       * Appends a chunk of input to the buffer.
       *
       * @param {string} chunk Text to append.
       * @param {boolean} isLastChunk True when no further chunks will follow.
       */
      write(chunk, isLastChunk) {
          if (this.html.length > 0) {
              this.html += chunk;
          }
          else {
              this.html = chunk;
          }
          this.endOfChunkHit = false;
          this.lastChunkWritten = isLastChunk;
      }

      /**
       * Inserts `chunk` right after the current position, so it is the next
       * input to be read.
       *
       * @param {string} chunk Text to insert.
       */
      insertHtmlAtCurrentPos(chunk) {
          this.html = this.html.substring(0, this.pos + 1) + chunk + this.html.substring(this.pos + 1);
          this.endOfChunkHit = false;
      }

      /**
       * Tests whether the buffer, starting at the current position, begins
       * with `pattern`.
       *
       * @param {string} pattern Text to look for. For a case-insensitive match
       *   it must already be lowercase: each input code unit is OR-ed with 0x20
       *   before comparison, so an uppercase ASCII letter in the pattern can
       *   never match.
       * @param {boolean} caseSensitive Compare exactly when true.
       * @returns {boolean} False as well when the buffer is too short; in that
       *   case `endOfChunkHit` is set if more input may still arrive.
       */
      startsWith(pattern, caseSensitive) {
          // Check if our buffer has enough characters
          if (this.pos + pattern.length > this.html.length) {
              this.endOfChunkHit = !this.lastChunkWritten;
              return false;
          }
          if (caseSensitive) {
              return this.html.startsWith(pattern, this.pos);
          }
          for (let i = 0; i < pattern.length; i++) {
              const cp = this.html.charCodeAt(this.pos + i) | 0x20;
              if (cp !== pattern.charCodeAt(i)) {
                  return false;
              }
          }
          return true;
      }

      /**
       * Reads the code unit `offset` positions ahead without consuming it.
       * CR is reported as LF; nothing else is normalised (an LF following a CR
       * is not skipped and surrogate pairs are not combined).
       *
       * @param {number} offset Distance from the current position.
       * @returns {number} The code unit, or `$.EOF` past the end of the buffer.
       */
      peek(offset) {
          const pos = this.pos + offset;
          if (pos >= this.html.length) {
              this.endOfChunkHit = !this.lastChunkWritten;
              return $.EOF;
          }
          const code = this.html.charCodeAt(pos);
          return code === $.CARRIAGE_RETURN ? $.LINE_FEED : code;
      }

      /**
       * Consumes and returns the next code point.
       *
       * @returns {number} The code point, or `$.EOF` when the buffer is
       *   exhausted (or ends in the middle of a possible surrogate pair).
       */
      advance() {
          this.pos++;
          //NOTE: LF should be in the last column of the line
          if (this.isEol) {
              this.isEol = false;
              this.line++;
              this.lineStartPos = this.pos;
          }
          if (this.pos >= this.html.length) {
              this.endOfChunkHit = !this.lastChunkWritten;
              return $.EOF;
          }
          let cp = this.html.charCodeAt(this.pos);
          //NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters
          if (cp === $.CARRIAGE_RETURN) {
              this.isEol = true;
              this.skipNextNewLine = true;
              return $.LINE_FEED;
          }
          //NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character
          //must be ignored.
          if (cp === $.LINE_FEED) {
              this.isEol = true;
              if (this.skipNextNewLine) {
                  // `line` will be bumped again in the recursive call.
                  this.line--;
                  this.skipNextNewLine = false;
                  this._addGap();
                  return this.advance();
              }
          }
          this.skipNextNewLine = false;
          if (isSurrogate(cp)) {
              cp = this._processSurrogate(cp);
          }
          //OPTIMIZATION: first check if code point is in the common allowed
          //range (ASCII alphanumeric, whitespaces, big chunk of BMP)
          //before going into detailed performance cost validation.
          //The handler check mirrors `_err()`: without a callback (null or
          //absent) nothing can be reported, so the validation is skipped.
          const isCommonValidRange = !this.handler.onParseError ||
              (cp > 0x1f && cp < 0x7f) ||
              cp === $.LINE_FEED ||
              cp === $.CARRIAGE_RETURN ||
              (cp > 0x9f && cp < 0xfdd0);
          if (!isCommonValidRange) {
              this._checkForProblematicCharacters(cp);
          }
          return cp;
      }

      /** Reports `cp` as a control character or a noncharacter when it is one. */
      _checkForProblematicCharacters(cp) {
          if (isControlCodePoint(cp)) {
              this._err(ERR.controlCharacterInInputStream);
          }
          else if (isUndefinedCodePoint(cp)) {
              this._err(ERR.noncharacterInInputStream);
          }
      }

      /**
       * Steps back `count` code points. Each recorded gap that is crossed
       * (second half of a surrogate pair, LF swallowed after CR) costs one
       * extra position.
       *
       * @param {number} count Number of code points to un-consume.
       */
      retreat(count) {
          this.pos -= count;
          while (this.pos < this.lastGapPos) {
              this.lastGapPos = this.gapStack.pop();
              this.pos--;
          }
          this.isEol = false;
      }
  }