Skip to content

Thanks #1

@NikolaiT

Description

@NikolaiT

Just wanted to drop by and say thanks. It's good to be aware of those techniques. Avoiding detection is insanely complex.

What kind of scraping setup do you suggest?

I am currently going with something like this, what do you think?

/**
* This test uses the real Google Chrome browser and not a precompiled puppeteer binary.
* 
* Furthermore, we start the browser manually and not with puppeteer.
*/
const puppeteer = require('puppeteer-core');
const exec = require('child_process').exec;
const fs = require('fs');

// change this when necessary
const GOOGLE_CHROME_BINARY = '/usr/bin/google-chrome-stable';

/**
* Pause for the given number of milliseconds.
*
* @param {number} ms - delay passed to setTimeout.
* @returns {Promise<undefined>} promise that resolves once the timer fires.
*/
function sleep(ms) {
 return new Promise((done) => {
   setTimeout(done, ms);
 });
}

/**
* Run a shell command and hand its output to a callback.
*
* A failure (spawn error or non-zero exit) is logged to stderr instead of
* being silently dropped. The callback is still invoked in that case so
* existing callers that only read stdout keep working.
*
* @param {string} command - command line passed to child_process.exec.
* @param {function(string, ?Error, string): void} callback - receives
*   stdout first, then the error (null on success) and stderr.
*/
function execute(command, callback){
 exec(command, function(error, stdout, stderr){
   if (error) {
     console.error(`Command failed: ${command}`, error);
   }
   callback(stdout, error, stderr);
 });
}

/**
* Poll a log file periodically until it contains a
* "DevTools listening on <endpoint>" line, and return that endpoint
* so it can be used to connect to the browser.
*
* Each attempt waits first and then reads the file, so the total wait is
* at most attempts * intervalMs. If no endpoint is found the process is
* terminated with a non-zero exit status, because the script cannot
* continue without a browser connection.
*
* @param {string} [logFile='./browser.log'] - file to scan.
* @param {number} [attempts=10] - how many times to check the file.
* @param {number} [intervalMs=500] - delay before each check.
* @returns {Promise<string>} the text following "DevTools listening on ".
*/
async function getWsEndpoint(logFile = './browser.log', attempts = 10, intervalMs = 500) {
 // No `g` flag: a single exec() per read needs no lastIndex state.
 const endpointPattern = /DevTools listening on (.*)/i;
 for (let attempt = 1; attempt <= attempts; attempt++) {
   await sleep(intervalMs);
   let logContents;
   try {
     logContents = fs.readFileSync(logFile).toString();
   } catch (error) {
     // The log may simply not exist yet; anything else is a real failure.
     if (error.code !== 'ENOENT') {
       throw error;
     }
     continue;
   }
   const match = endpointPattern.exec(logContents);
   if (match) {
     return match[1];
   }
 }
 console.error('Could not get wsEndpoint');
 // Exit non-zero so shells and CI see this as a failure.
 process.exit(1);
}

/**
* Entry point: start Chrome from the shell, attach to it over the
* DevTools WebSocket, take a full-page screenshot of google.com and
* shut the browser down again.
*/
(async () => {
 // start browser; its stderr is redirected to browser.log, which
 // getWsEndpoint() polls for the DevTools endpoint
 const command = `${GOOGLE_CHROME_BINARY} --remote-debugging-port=9222 --no-first-run --no-default-browser-check 2> browser.log &`;
 execute(command, (stdout) => {
   console.log(stdout);
 });

 // now connect to the browser
 // we do not start the browser with puppeteer,
 // because we want to influence the startup process
 // as little as possible
 const browser = await puppeteer.connect({
   browserWSEndpoint: await getWsEndpoint(),
   defaultViewport: null,
 });

 try {
   const page = await browser.newPage();
   await page.goto('https://google.com');
   await sleep(1000);
   await page.screenshot({path: "bot.png", fullPage: true});

   await page.close();
 } finally {
   // close the browser even when navigation or the screenshot fails
   await browser.close();
 }
})().catch((error) => {
 // surface failures instead of leaving an unhandled rejection
 console.error(error);
 process.exitCode = 1;
});

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions