1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495 |
- import puppeteer from 'puppeteer';
- import logger from './logger.js';
- import scrollToBottomBrowser from './browserUtils/scrollToBottom.js';
- class PuppeteerPlugin {
- constructor ({
- launchOptions = {},
- gotoOptions = {},
- scrollToBottom = null,
- blockNavigation = false
- } = {}) {
- this.launchOptions = launchOptions;
- this.gotoOptions = gotoOptions;
- this.scrollToBottom = scrollToBottom;
- this.blockNavigation = blockNavigation;
- this.browser = null;
- this.headers = {};
- logger.info('init plugin', { launchOptions, scrollToBottom, blockNavigation });
- }
- apply (registerAction) {
- registerAction('beforeStart', async () => {
- this.browser = await puppeteer.launch(this.launchOptions);
- });
- registerAction('beforeRequest', async ({requestOptions}) => {
- if (hasValues(requestOptions.headers)) {
- this.headers = Object.assign({}, requestOptions.headers);
- }
- return {requestOptions};
- });
- registerAction('afterResponse', async ({response}) => {
- const contentType = response.headers['content-type'];
- const isHtml = contentType && contentType.split(';')[0] === 'text/html';
- if (isHtml) {
- const url = response.url;
- const page = await this.browser.newPage();
- if (hasValues(this.headers)) {
- logger.info('set headers to puppeteer page', this.headers);
- await page.setExtraHTTPHeaders(this.headers);
- }
- if (this.blockNavigation) {
- await blockNavigation(page, url);
- }
- await page.goto(url, this.gotoOptions);
- if (this.scrollToBottom) {
- await scrollToBottom(page, this.scrollToBottom.timeout, this.scrollToBottom.viewportN);
- }
- const content = await page.content();
- await page.close();
- // convert utf-8 -> binary string because website-scraper needs binary
- return Buffer.from(content).toString('binary');
- } else {
- return response.body;
- }
- });
- registerAction('afterFinish', () => this.browser && this.browser.close());
- }
- }
- function hasValues (obj) {
- return obj && Object.keys(obj).length > 0;
- }
- async function scrollToBottom (page, timeout, viewportN) {
- logger.info(`scroll puppeteer page to bottom ${viewportN} times with timeout = ${timeout}`);
- await page.evaluate(scrollToBottomBrowser, timeout, viewportN);
- }
- async function blockNavigation (page, url) {
- logger.info(`block navigation for puppeteer page from url ${url}`);
- page.on('request', req => {
- if (req.isNavigationRequest() && req.frame() === page.mainFrame() && req.url() !== url) {
- req.abort('aborted');
- } else {
- req.continue();
- }
- });
- await page.setRequestInterception(true);
- }
- export default PuppeteerPlugin;
|