index.js 2.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495
  1. import puppeteer from 'puppeteer';
  2. import logger from './logger.js';
  3. import scrollToBottomBrowser from './browserUtils/scrollToBottom.js';
  /**
   * website-scraper plugin that re-loads every HTML response in a puppeteer
   * page and hands the rendered markup back to the scraper instead of the
   * original response body.
   */
  class PuppeteerPlugin {
    /**
     * @param {object} [options]
     * @param {object} [options.launchOptions={}] - forwarded to `puppeteer.launch`.
     * @param {object} [options.gotoOptions={}] - forwarded as the second argument of `page.goto`.
     * @param {?{timeout: *, viewportN: *}} [options.scrollToBottom=null] - when truthy,
     *   the page is scrolled after loading; `timeout` and `viewportN` are passed
     *   through to the in-page scroll helper.
     * @param {boolean} [options.blockNavigation=false] - when truthy, main-frame
     *   navigations away from the requested url are aborted.
     */
    constructor ({
      launchOptions = {},
      gotoOptions = {},
      scrollToBottom = null,
      blockNavigation = false
    } = {}) {
      this.launchOptions = launchOptions;
      this.gotoOptions = gotoOptions;
      this.scrollToBottom = scrollToBottom;
      this.blockNavigation = blockNavigation;
      this.browser = null;
      this.headers = {};

      logger.info('init plugin', { launchOptions, scrollToBottom, blockNavigation });
    }

    /**
     * Registers the plugin's handlers on the scraper.
     *
     * @param {function(string, function): void} registerAction - called once per
     *   action name with the handler to attach.
     */
    apply (registerAction) {
      registerAction('beforeStart', async () => {
        this.browser = await puppeteer.launch(this.launchOptions);
      });

      // Remember the most recent non-empty request headers so the puppeteer page
      // can send the same ones.
      registerAction('beforeRequest', async ({requestOptions}) => {
        if (hasValues(requestOptions.headers)) {
          this.headers = Object.assign({}, requestOptions.headers);
        }
        return {requestOptions};
      });

      registerAction('afterResponse', async ({response}) => {
        const contentType = response.headers['content-type'];
        const isHtml = contentType && contentType.split(';')[0] === 'text/html';
        if (!isHtml) {
          // Non-HTML resources are passed through untouched.
          return response.body;
        }

        const url = response.url;
        const page = await this.browser.newPage();
        try {
          if (hasValues(this.headers)) {
            logger.info('set headers to puppeteer page', this.headers);
            await page.setExtraHTTPHeaders(this.headers);
          }

          if (this.blockNavigation) {
            await blockNavigation(page, url);
          }

          await page.goto(url, this.gotoOptions);

          if (this.scrollToBottom) {
            await scrollToBottom(page, this.scrollToBottom.timeout, this.scrollToBottom.viewportN);
          }

          const content = await page.content();

          // convert utf-8 -> binary string because website-scraper needs binary
          return Buffer.from(content).toString('binary');
        } finally {
          // Close the page even when navigation or rendering throws; otherwise
          // every failed resource leaves an open page behind in the browser.
          await page.close();
        }
      });

      registerAction('afterFinish', () => this.browser && this.browser.close());
    }
  }
  /**
   * Tells whether `obj` has at least one own enumerable key.
   *
   * Always returns a strict boolean; `null` and `undefined` yield `false`
   * rather than being returned as-is.
   *
   * @param {*} obj - value to inspect.
   * @returns {boolean} `true` when `Object.keys(obj)` is non-empty.
   */
  function hasValues (obj) {
    // `!= null` rejects both null and undefined, which Object.keys would throw on.
    return obj != null && Object.keys(obj).length > 0;
  }
  /**
   * Logs the scroll request, then runs the imported `scrollToBottomBrowser`
   * helper through `page.evaluate` with `timeout` and `viewportN` as arguments.
   *
   * @param {object} page - puppeteer page to scroll.
   * @param {*} timeout - forwarded unchanged to the in-page helper.
   * @param {*} viewportN - forwarded unchanged to the in-page helper.
   * @returns {Promise<void>} settles once `page.evaluate` settles.
   */
  async function scrollToBottom (page, timeout, viewportN) {
    const message = `scroll puppeteer page to bottom ${viewportN} times with timeout = ${timeout}`;
    logger.info(message);

    const helperArgs = [timeout, viewportN];
    await page.evaluate(scrollToBottomBrowser, ...helperArgs);
  }
  /**
   * Attaches a request handler to `page` and enables request interception so
   * that main-frame navigation requests to any address other than `url` are
   * aborted; every other request is continued.
   *
   * @param {object} page - puppeteer page to restrict.
   * @param {string} url - the only address the main frame may navigate to.
   * @returns {Promise<void>} settles once request interception is enabled.
   */
  async function blockNavigation (page, url) {
    logger.info(`block navigation for puppeteer page from url ${url}`);

    const handleRequest = (request) => {
      const navigatesAway = request.isNavigationRequest() &&
        request.frame() === page.mainFrame() &&
        request.url() !== url;

      if (!navigatesAway) {
        request.continue();
        return;
      }

      request.abort('aborted');
    };

    // The listener is attached before interception is switched on.
    page.on('request', handleRequest);
    await page.setRequestInterception(true);
  }
  75. export default PuppeteerPlugin;