-
Notifications
You must be signed in to change notification settings - Fork 267
Open
Description
Just wanted to drop by and say thanks. It's good to be aware of those techniques. It's insanely complex to not get detected.
What kind of scraping setup do you suggest?
I am currently going with something like this, what do you think?
/**
* This test uses the real Google Chrome browser and not a precompiled puppeteer binary.
*
* Furthermore, we start the browser manually and not with puppeteer.
*/
const puppeteer = require('puppeteer-core');
const exec = require('child_process').exec;
const fs = require('fs');
// Path of the Google Chrome executable that this script launches from the
// shell. Change this when necessary (e.g. a different install location).
const GOOGLE_CHROME_BINARY = '/usr/bin/google-chrome-stable';
/**
 * Pause for the given number of milliseconds.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Run a shell command and hand its captured standard output to `callback`.
 *
 * A command that cannot be spawned or that exits with a non-zero status is
 * reported on stderr instead of being dropped silently. The callback is still
 * invoked in that case, so callers that only look at stdout keep working.
 *
 * @param {string} command - Command line passed to `child_process.exec`.
 * @param {function(string, ?Error, string): void} callback - Receives stdout
 *   first, then the error (`null` on success), then stderr.
 */
function execute(command, callback) {
  exec(command, (error, stdout, stderr) => {
    if (error) {
      console.error(`Command failed: ${command}`, error);
    }
    callback(stdout, error, stderr);
  });
}
/**
 * Poll browser.log periodically until it contains the wsEndpoint
 * that we use to connect to the browser.
 *
 * The log file is checked up to 10 times, waiting 500 ms before each check.
 * If the "DevTools listening on ..." line never shows up, the failure is
 * reported on stderr and the process exits with status 1 so that shells and
 * CI runners see the run as failed.
 *
 * @returns {Promise<string>} The text following "DevTools listening on ".
 */
async function getWsEndpoint() {
  const wsEndpointFile = './browser.log';
  // No `g` flag: only the first match is needed, and a stateless regex can be
  // reused safely across loop iterations.
  const endpointPattern = /DevTools listening on (.*)/i;
  const maxAttempts = 10;
  const pollIntervalMs = 500;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    await sleep(pollIntervalMs);
    if (!fs.existsSync(wsEndpointFile)) {
      continue;
    }
    const logContents = fs.readFileSync(wsEndpointFile).toString();
    const match = endpointPattern.exec(logContents);
    if (match) {
      return match[1];
    }
  }

  console.error('Could not get wsEndpoint');
  // Non-zero status: giving up here is a failure, not a clean shutdown.
  process.exit(1);
}
/**
 * Entry point: launch Chrome from the shell, attach puppeteer to it through
 * the DevTools WebSocket endpoint, take a full-page screenshot of google.com
 * and shut the browser down again.
 */
(async () => {
  // Start the browser. stderr is redirected to browser.log because that is
  // where getWsEndpoint() looks for the "DevTools listening on" line.
  const command = `${GOOGLE_CHROME_BINARY} --remote-debugging-port=9222 --no-first-run --no-default-browser-check 2> browser.log &`;
  execute(command, (stdout) => {
    console.log(stdout);
  });

  // Now connect to the browser.
  // We do not start the browser with puppeteer,
  // because we want to influence the startup process
  // as little as possible.
  const browser = await puppeteer.connect({
    browserWSEndpoint: await getWsEndpoint(),
    defaultViewport: null,
  });

  try {
    const page = await browser.newPage();
    await page.goto('https://google.com');
    // Fixed 1 s pause before the screenshot (presumably to let the page
    // settle after navigation).
    await sleep(1000);
    await page.screenshot({ path: 'bot.png', fullPage: true });
    await page.close();
  } finally {
    // Always close the browser, even when navigation or the screenshot
    // fails, so the Chrome process we launched is not left running.
    await browser.close();
  }
})().catch((error) => {
  // Report the failure and mark the run as failed instead of leaving an
  // unhandled promise rejection.
  console.error(error);
  process.exitCode = 1;
});
Metadata
Assignees
Labels
No labels