index.js 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. "use strict";
  2. /**
  3. * @file Batteries-included version of Cheerio. This module includes several
  4. * convenience methods for loading documents from various sources.
  5. */
  /**
   * TypeScript-emitted helper that exposes property `k` of module `m` on `o`
   * under the name `k2` (which defaults to `k`).
   *
   * When `Object.create` is available, the property is defined through a
   * descriptor: the source descriptor is reused as-is unless it is missing, is
   * an accessor on a non-`__esModule` source, or is a writable/configurable data
   * property — in those cases an enumerable getter that reads `m[k]` on every
   * access is installed instead. Without `Object.create`, the value is copied
   * with a plain assignment.
   *
   * An already-installed helper on `this` takes precedence.
   */
  var __createBinding = (this && this.__createBinding) || (function () {
      if (!Object.create) {
          // Legacy engines: one-off copy of the current value.
          return function (target, source, key, alias) {
              if (alias === undefined) alias = key;
              target[alias] = source[key];
          };
      }
      return function (target, source, key, alias) {
          if (alias === undefined) alias = key;
          var descriptor = Object.getOwnPropertyDescriptor(source, key);
          var needsGetter = !descriptor ||
              ("get" in descriptor
                  ? !source.__esModule
                  : descriptor.writable || descriptor.configurable);
          if (needsGetter) {
              // Read through to the source on every access so later changes stay visible.
              descriptor = { enumerable: true, get: function () { return source[key]; } };
          }
          Object.defineProperty(target, alias, descriptor);
      };
  })();
  /**
   * TypeScript-emitted helper that stores `v` as the `default` property of `o`.
   *
   * With `Object.create` available the property is defined as an enumerable,
   * non-writable data property; otherwise it is assigned directly. An
   * already-installed helper on `this` takes precedence.
   */
  var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
      if (Object.create) {
          return function (target, value) {
              Object.defineProperty(target, "default", { enumerable: true, value: value });
          };
      }
      return function (target, value) {
          target["default"] = value;
      };
  })();
  /**
   * TypeScript-emitted helper implementing `export * from`: binds every
   * enumerable property of `m` onto `exports`, skipping `default` and any name
   * `exports` already owns.
   */
  var __exportStar = (this && this.__exportStar) || function (m, exports) {
      var owns = Object.prototype.hasOwnProperty;
      for (var name in m) {
          if (name === "default") {
              continue;
          }
          // Names already present on the target win over star re-exports.
          if (owns.call(exports, name)) {
              continue;
          }
          __createBinding(exports, m, name);
      }
  };
  /**
   * TypeScript-emitted helper implementing `import * as ns`: modules flagged
   * with `__esModule` are returned untouched; anything else is wrapped in a new
   * object that carries the module's own enumerable properties (except
   * `default`) and has the module itself as `default`.
   */
  var __importStar = (this && this.__importStar) || function (mod) {
      if (mod && mod.__esModule) {
          return mod;
      }
      var namespace = {};
      if (mod != null) {
          var owns = Object.prototype.hasOwnProperty;
          for (var key in mod) {
              if (key !== "default" && owns.call(mod, key)) {
                  __createBinding(namespace, mod, key);
              }
          }
      }
      __setModuleDefault(namespace, mod);
      return namespace;
  };
  /**
   * TypeScript-emitted helper implementing default imports: modules flagged
   * with `__esModule` are returned as-is, anything else is wrapped as
   * `{ default: mod }`.
   */
  var __importDefault = (this && this.__importDefault) || function (mod) {
      if (mod && mod.__esModule) {
          return mod;
      }
      return { "default": mod };
  };
  35. Object.defineProperty(exports, "__esModule", { value: true });
  36. exports.merge = exports.contains = void 0;
  37. exports.loadBuffer = loadBuffer;
  38. exports.stringStream = stringStream;
  39. exports.decodeStream = decodeStream;
  40. exports.fromURL = fromURL;
  41. __exportStar(require("./load-parse.js"), exports);
  42. var static_js_1 = require("./static.js");
  43. Object.defineProperty(exports, "contains", { enumerable: true, get: function () { return static_js_1.contains; } });
  44. Object.defineProperty(exports, "merge", { enumerable: true, get: function () { return static_js_1.merge; } });
  45. const parse5_htmlparser2_tree_adapter_1 = require("parse5-htmlparser2-tree-adapter");
  46. const htmlparser2 = __importStar(require("htmlparser2"));
  47. const parse5_parser_stream_1 = require("parse5-parser-stream");
  48. const encoding_sniffer_1 = require("encoding-sniffer");
  49. const undici = __importStar(require("undici"));
  50. const whatwg_mimetype_1 = __importDefault(require("whatwg-mimetype"));
  51. const node_stream_1 = require("node:stream");
  52. const options_js_1 = require("./options.js");
  53. const load_parse_js_1 = require("./load-parse.js");
  /**
   * Sniffs the encoding of a buffer, then creates a querying function bound to a
   * document created from the buffer.
   *
   * The fallback encoding is `utf8` when the flattened options enable `xmlMode`
   * and `windows-1252` otherwise. It is only applied when
   * `options.encoding.defaultEncoding` is `null` or `undefined`, which matches
   * the behaviour of `decodeStream`. The caller's `options.encoding` object is
   * never modified.
   *
   * @category Loading
   * @example
   *
   * ```js
   * import * as cheerio from 'cheerio';
   * import * as fs from 'fs';
   *
   * const buffer = fs.readFileSync('index.html');
   * const $ = cheerio.loadBuffer(buffer);
   * ```
   *
   * @param buffer - The buffer to sniff the encoding of.
   * @param options - The options to pass to Cheerio. `options.encoding` is
   *   forwarded to the encoding sniffer.
   * @returns The loaded document.
   */
  function loadBuffer(buffer, options = {}) {
      const opts = (0, options_js_1.flattenOptions)(options);
      // Work on a copy so the caller's encoding settings stay untouched.
      const decodeOptions = { ...options.encoding };
      if (decodeOptions.defaultEncoding == null) {
          decodeOptions.defaultEncoding = opts != null && opts.xmlMode ? 'utf8' : 'windows-1252';
      }
      const str = (0, encoding_sniffer_1.decodeBuffer)(buffer, decodeOptions);
      return (0, load_parse_js_1.load)(str, opts);
  }
  /**
   * Builds the writable stream behind `stringStream` and `decodeStream`.
   *
   * When `options._useHtmlParser2` is truthy, string chunks are fed to an
   * htmlparser2 document stream wrapped in a `Writable`; a non-string chunk
   * makes `write` throw a `TypeError`. Otherwise a parse5 `ParserStream` is
   * created with the htmlparser2 tree adapter as the default `treeAdapter` and
   * with `scriptingEnabled` forced to `true` unless it was explicitly `false`.
   *
   * The parse5 settings are applied to a shallow copy, so the `options` object
   * passed in is never modified (and may be frozen or shared).
   *
   * @param options - Already-flattened Cheerio options; may be `null` or
   *   `undefined`.
   * @param cb - Called as `cb(err, $)` once the document has been parsed.
   * @returns The writable stream that accepts the document's string chunks.
   */
  function _stringStream(options, cb) {
      if (options != null && options._useHtmlParser2) {
          const parser = htmlparser2.createDocumentStream((err, document) => cb(err, (0, load_parse_js_1.load)(document)), options);
          return new node_stream_1.Writable({
              // Keep chunks as strings instead of letting the stream turn them into buffers.
              decodeStrings: false,
              write(chunk, _encoding, callback) {
                  if (typeof chunk !== 'string') {
                      throw new TypeError('Expected a string');
                  }
                  parser.write(chunk);
                  callback();
              },
              final(callback) {
                  parser.end();
                  callback();
              },
          });
      }
      // Copy before applying defaults so the caller's object is left untouched.
      const parserOptions = { ...options };
      if (parserOptions.treeAdapter == null) {
          parserOptions.treeAdapter = parse5_htmlparser2_tree_adapter_1.adapter;
      }
      // Scripting stays enabled unless it was switched off explicitly.
      parserOptions.scriptingEnabled = parserOptions.scriptingEnabled !== false;
      const stream = new parse5_parser_stream_1.ParserStream(parserOptions);
      (0, node_stream_1.finished)(stream, (err) => cb(err, (0, load_parse_js_1.load)(stream.document)));
      return stream;
  }
  /**
   * Creates a `Writable` stream that parses the strings written to it into a
   * document.
   *
   * Once the stream has finished, `cb` is invoked with the loaded document.
   *
   * @category Loading
   * @example
   *
   * ```js
   * import * as cheerio from 'cheerio';
   * import * as fs from 'fs';
   *
   * const writeStream = cheerio.stringStream({}, (err, $) => {
   *   if (err) {
   *     // Handle error
   *   }
   *
   *   console.log($('h1').text());
   *   // Output: Hello, world!
   * });
   *
   * fs.createReadStream('my-document.html', { encoding: 'utf8' }).pipe(
   *   writeStream,
   * );
   * ```
   *
   * @param options - The options to pass to Cheerio.
   * @param cb - The callback to call when the stream is finished.
   * @returns The writable stream.
   */
  function stringStream(options, cb) {
      const flattened = (0, options_js_1.flattenOptions)(options);
      return _stringStream(flattened, cb);
  }
  /**
   * Parses a stream of buffers into a document.
   *
   * The returned stream is an encoding-sniffing decode stream that is piped
   * into the string-parsing stream. When the parsing stream is finished, the
   * callback is called with the loaded document.
   *
   * If `options.encoding.defaultEncoding` is `null` or `undefined`, it falls
   * back to `utf8` in XML mode and to `windows-1252` otherwise. The fallback is
   * applied to a copy, so an `encoding` object shared between calls is never
   * modified and cannot leak one call's default into the next.
   *
   * @category Loading
   * @param options - The options to pass to Cheerio. `options.encoding` is
   *   forwarded to the decode stream; the remaining keys are Cheerio options.
   * @param cb - The callback to call when the stream is finished.
   * @returns The writable stream.
   */
  function decodeStream(options, cb) {
      const { encoding = {}, ...cheerioOptions } = options;
      const opts = (0, options_js_1.flattenOptions)(cheerioOptions);
      // Copy first: the default below must not be written onto the caller's object.
      const decoderOptions = { ...encoding };
      if (decoderOptions.defaultEncoding == null) {
          // Set the default encoding to UTF-8 for XML mode
          decoderOptions.defaultEncoding = opts != null && opts.xmlMode ? 'utf8' : 'windows-1252';
      }
      const decoder = new encoding_sniffer_1.DecodeStream(decoderOptions);
      const loadStream = _stringStream(opts, cb);
      decoder.pipe(loadStream);
      return decoder;
  }
  /** Headers sent with a request when the caller does not supply any. */
  const defaultRequestHeaders = {
      // Prefer HTML and XML documents, but accept anything as a last resort.
      accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  };
  /** Request options `fromURL` falls back to when no `requestOptions` are given. */
  const defaultRequestOptions = {
      method: 'GET',
      // Follow up to five redirects by default.
      maxRedirections: 5,
      // NOTE: `throwOnError` currently doesn't work https://github.com/nodejs/undici/issues/1753
      throwOnError: true,
      headers: defaultRequestHeaders,
  };
  /**
   * `fromURL` loads a document from a URL.
   *
   * When no `requestOptions` are supplied, `defaultRequestOptions` are used
   * (GET, up to 5 redirects, `throwOnError` set, HTML/XML `accept` header). When
   * `requestOptions` are supplied without `headers`, the default headers are
   * added.
   *
   * The response's `content-type` decides how the body is parsed: it must be an
   * HTML or XML MIME type, its `charset` parameter is forwarded to the decoder,
   * and XML types enable `xmlMode`. `baseURL` is set to the last entry of the
   * redirect history when there is one, otherwise to `url`. Explicit Cheerio
   * options override these derived values.
   *
   * Neither `options.requestOptions` nor `options.encoding` is modified; both
   * are copied before anything is added to them.
   *
   * @category Loading
   * @example
   *
   * ```js
   * import * as cheerio from 'cheerio';
   *
   * const $ = await cheerio.fromURL('https://example.com');
   * ```
   *
   * @param url - The URL to load the document from.
   * @param options - The options to pass to Cheerio, plus optional
   *   `requestOptions` (for the HTTP request) and `encoding` (for the decoder).
   * @returns The loaded document.
   * @throws {RangeError} Thrown from the response handler when the
   *   `content-type` is neither HTML nor XML.
   */
  async function fromURL(url, options = {}) {
      const { requestOptions = defaultRequestOptions, encoding = {}, ...cheerioOptions } = options;
      // Copy so that neither the caller's object nor the shared defaults are written to.
      const request = { ...requestOptions };
      // Add headers if none were supplied.
      if (request.headers == null) {
          request.headers = defaultRequestOptions.headers;
      }
      let undiciStream;
      const promise = new Promise((resolve, reject) => {
          undiciStream = undici.stream(url, request, (res) => {
              const header = res.headers['content-type'];
              const contentType = header == null ? 'text/html' : header;
              const mimeType = new whatwg_mimetype_1.default(Array.isArray(contentType) ? contentType[0] : contentType);
              if (!mimeType.isHTML() && !mimeType.isXML()) {
                  throw new RangeError(`The content-type "${contentType}" is neither HTML nor XML.`);
              }
              /*
               * If we allow redirects, we will have entries in the history.
               * The last entry will be the final URL.
               */
              const history = res.context == null ? undefined : res.context.history;
              const hasHistory = history != null && history.length > 0;
              const opts = {
                  // Forward the charset from the header to the decodeStream.
                  encoding: {
                      ...encoding,
                      transportLayerEncodingLabel: mimeType.parameters.get('charset'),
                  },
                  // Set XML mode based on the MIME type.
                  xmlMode: mimeType.isXML(),
                  // Set the `baseURL` to the final URL; an empty history falls back to `url`.
                  baseURL: hasHistory ? history[history.length - 1] : url,
                  ...cheerioOptions,
              };
              return decodeStream(opts, (err, $) => (err ? reject(err) : resolve($)));
          });
      });
      // Let's make sure the request is completed before returning the promise.
      await undiciStream;
      return promise;
  }
  229. //# sourceMappingURL=index.js.map