index.js 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
  1. /**
  2. * @file Batteries-included version of Cheerio. This module includes several
  3. * convenience methods for loading documents from various sources.
  4. */
  5. export * from './load-parse.js';
  6. export { contains, merge } from './static.js';
  7. import { adapter as htmlparser2Adapter } from 'parse5-htmlparser2-tree-adapter';
  8. import * as htmlparser2 from 'htmlparser2';
  9. import { ParserStream as Parse5Stream } from 'parse5-parser-stream';
  10. import { decodeBuffer, DecodeStream, } from 'encoding-sniffer';
  11. import * as undici from 'undici';
  12. import MIMEType from 'whatwg-mimetype';
  13. import { Writable, finished } from 'node:stream';
  14. import { flattenOptions, } from './options.js';
  15. import { load } from './load-parse.js';
  /**
   * Sniffs the encoding of a buffer, decodes it, and creates a querying
   * function bound to a document parsed from the decoded text.
   *
   * When sniffing finds nothing, the fallback encoding is `utf8` in XML mode and
   * `windows-1252` otherwise. Entries in `options.encoding` are spread over that
   * fallback and are passed on to the decoder.
   *
   * @category Loading
   * @example
   *
   * ```js
   * import * as cheerio from 'cheerio';
   * import * as fs from 'fs';
   *
   * const buffer = fs.readFileSync('index.html');
   * const $ = cheerio.loadBuffer(buffer);
   * ```
   *
   * @param buffer - The buffer to sniff the encoding of.
   * @param options - The options to pass to Cheerio. `options.encoding` holds
   *   the options for the encoding sniffer.
   * @returns The loaded document.
   */
  export function loadBuffer(buffer, options = {}) {
    const opts = flattenOptions(options);
    // XML documents default to UTF-8; HTML falls back to windows-1252.
    const isXml = opts != null && opts.xmlMode;
    const str = decodeBuffer(buffer, {
      defaultEncoding: isXml ? 'utf8' : 'windows-1252',
      // Caller-supplied sniffer options take precedence over the default above.
      ...options.encoding,
    });
    return load(str, opts);
  }
  /**
   * Builds the writable stream behind `stringStream` and `decodeStream`.
   *
   * With `options._useHtmlParser2` set, chunks are fed to an htmlparser2
   * document stream; otherwise a parse5 parser stream is returned. In both
   * cases `cb` receives the result of `load` on the parsed document.
   *
   * @param options - Flattened Cheerio options; may be `null`/`undefined`. The
   *   object is not modified.
   * @param cb - Called as `cb(err, $)` once parsing has ended.
   * @returns A writable stream that accepts strings.
   */
  function _stringStream(options, cb) {
    if (options != null && options._useHtmlParser2) {
      const parser = htmlparser2.createDocumentStream((err, document) => cb(err, load(document)), options);
      return new Writable({
        // Keep string chunks as strings instead of converting them to buffers.
        decodeStrings: false,
        write(chunk, _encoding, callback) {
          if (typeof chunk !== 'string') {
            // Errors in `write` must go through the callback; throwing here is
            // undefined behaviour for Node streams.
            callback(new TypeError('Expected a string'));
            return;
          }
          parser.write(chunk);
          callback();
        },
        final(callback) {
          parser.end();
          callback();
        },
      });
    }
    // Work on a copy so the caller's (possibly shared) options stay untouched.
    const base = options == null ? {} : options;
    const parse5Options = {
      ...base,
      treeAdapter: base.treeAdapter == null ? htmlparser2Adapter : base.treeAdapter,
      // Scripting stays enabled unless it was explicitly set to `false`.
      scriptingEnabled: base.scriptingEnabled !== false,
    };
    const stream = new Parse5Stream(parse5Options);
    finished(stream, (err) => cb(err, load(stream.document)));
    return stream;
  }
  /**
   * Returns a `Writable` stream that assembles a document from the strings
   * written to it.
   *
   * Once the stream has finished, `cb` is invoked with the loaded document.
   *
   * @category Loading
   * @example
   *
   * ```js
   * import * as cheerio from 'cheerio';
   * import * as fs from 'fs';
   *
   * const writeStream = cheerio.stringStream({}, (err, $) => {
   *   if (err) {
   *     // Handle error
   *   }
   *
   *   console.log($('h1').text());
   *   // Output: Hello, world!
   * });
   *
   * fs.createReadStream('my-document.html', { encoding: 'utf8' }).pipe(
   *   writeStream,
   * );
   * ```
   *
   * @param options - The options to pass to Cheerio.
   * @param cb - The callback to call when the stream is finished.
   * @returns The writable stream.
   */
  export function stringStream(options, cb) {
    // Normalise the public options before handing them to the shared builder.
    const flattened = flattenOptions(options);
    return _stringStream(flattened, cb);
  }
  /**
   * Parses a stream of buffers into a document.
   *
   * The returned stream is a `DecodeStream` that accepts buffers and is piped
   * into a string-parsing stream. When parsing has finished, the callback is
   * called with the loaded document.
   *
   * The fallback encoding is `utf8` in XML mode and `windows-1252` otherwise,
   * unless `options.encoding.defaultEncoding` is set. The caller's `encoding`
   * object is copied, not modified.
   *
   * @category Loading
   * @param options - The options to pass to Cheerio; `options.encoding` holds
   *   the options for the decoder. May be omitted.
   * @param cb - The callback to call when the stream is finished.
   * @returns The writable stream.
   */
  export function decodeStream(options, cb) {
    const { encoding, ...cheerioOptions } = options == null ? {} : options;
    const opts = flattenOptions(cheerioOptions);
    // Copy so the default below is never written onto the caller's object.
    const sniffOptions = { ...encoding };
    if (sniffOptions.defaultEncoding == null) {
      // Set the default encoding to UTF-8 for XML mode.
      sniffOptions.defaultEncoding = (opts != null && opts.xmlMode) ? 'utf8' : 'windows-1252';
    }
    const decoder = new DecodeStream(sniffOptions);
    const loadStream = _stringStream(opts, cb);
    decoder.pipe(loadStream);
    return decoder;
  }
  // Value of the Accept header sent with the default request options.
  const defaultAcceptHeader = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
  /** Request options used by `fromURL` when the caller supplies none. */
  const defaultRequestOptions = {
    method: 'GET',
    // Redirects are allowed by default.
    maxRedirections: 5,
    // NOTE: `throwOnError` currently doesn't work https://github.com/nodejs/undici/issues/1753
    throwOnError: true,
    headers: { accept: defaultAcceptHeader },
  };
  /**
   * `fromURL` loads a document from a URL.
   *
   * When no `requestOptions` are given, the module's default request options
   * are used. When `requestOptions` are given without `headers`, the default
   * headers are added. The response must have an HTML or XML content type (a
   * missing `content-type` header is treated as `text/html`).
   *
   * The caller's `requestOptions` and `encoding` objects are not modified.
   *
   * @category Loading
   * @example
   *
   * ```js
   * import * as cheerio from 'cheerio';
   *
   * const $ = await cheerio.fromURL('https://example.com');
   * ```
   *
   * @param url - The URL to load the document from.
   * @param options - The options to pass to Cheerio. `requestOptions` is passed
   *   to `undici.stream`; `encoding` is passed on to the decoding stream.
   * @returns The loaded document. The promise rejects if the request fails, if
   *   the content type is neither HTML nor XML (`RangeError`), or if parsing
   *   reports an error.
   */
  export async function fromURL(url, options = {}) {
    const { requestOptions = defaultRequestOptions, encoding = {}, ...cheerioOptions } = options;
    // Add headers if none were supplied, without touching the caller's object.
    const request = {
      ...requestOptions,
      headers: requestOptions.headers == null ? defaultRequestOptions.headers : requestOptions.headers,
    };
    let undiciStream;
    const documentPromise = new Promise((resolve, reject) => {
      undiciStream = undici.stream(url, request, (res) => {
        const rawContentType = res.headers['content-type'];
        const contentType = rawContentType == null ? 'text/html' : rawContentType;
        const mimeType = new MIMEType(Array.isArray(contentType) ? contentType[0] : contentType);
        if (!mimeType.isHTML() && !mimeType.isXML()) {
          throw new RangeError(`The content-type "${contentType}" is neither HTML nor XML.`);
        }
        /*
         * If we allow redirects, we will have entries in the history.
         * The last entry will be the final URL.
         */
        const history = res.context == null ? undefined : res.context.history;
        const opts = {
          // Forward the charset from the header to the decodeStream.
          encoding: {
            ...encoding,
            transportLayerEncodingLabel: mimeType.parameters.get('charset'),
          },
          // Set XML mode based on the MIME type.
          xmlMode: mimeType.isXML(),
          // Set the `baseURL` to the final URL.
          baseURL: history ? history[history.length - 1] : url,
          // Caller-supplied Cheerio options override the derived values above.
          ...cheerioOptions,
        };
        return decodeStream(opts, (err, $) => (err ? reject(err) : resolve($)));
      });
    });
    /*
     * Wait for both the request and the parse. Observing both promises together
     * means a parse error rejects immediately instead of going unhandled while
     * the request is still pending.
     */
    const [$] = await Promise.all([documentPromise, undiciStream]);
    return $;
  }
  191. //# sourceMappingURL=index.js.map