Voici un script Puppeteer pour capturer les e-mails présents sur YouTube. Pour cela, on lance une recherche sur YouTube, on fait défiler la page de résultats (scroll), puis on visite chaque vidéo une à une.
Il faut bien entendu créer une base de données pour stocker sa liste.
La syntaxe est la suivante:
où
jeu est le mot clé
1000 est le nombre de pages à scroller
liste1 est le nom de la liste
node youtube.js "jeu" 1000 "liste1"
const puppeteer = require('puppeteer');
const mysql = require('mysql');
const config = require('./config.json')
// Command-line arguments: node youtube.js "<keyword>" <limit> "<liste>"
const keyword = process.argv[2];
// Converted explicitly so the scroll loop works with a number rather than a string.
const limit = Number(process.argv[3]);
const liste = process.argv[4];

// Fail fast on unusable arguments: a missing keyword would otherwise be typed
// into the search box as the text "undefined", and a non-numeric limit would
// silently disable scrolling.
if (!keyword || !Number.isFinite(limit)) {
  console.error('Usage: node youtube.js "<keyword>" <limit> "<liste>"');
  process.exit(1);
}
console.log(keyword, limit, liste);

// MySQL connection used to store the scraped rows; settings come from config.json.
const connection = mysql.createConnection({
  host: config.host,
  user: config.user,
  password: config.password,
  database: config.DB,
  debug: false,
});

// Without a callback, a failed handshake is only reported through an unhandled
// 'error' event. Report it clearly and stop, since nothing could be stored.
connection.connect((error) => {
  if (error) {
    console.error('MySQL connection failed:', error);
    process.exit(1);
  }
});

// Puppeteer browser handle; assigned inside run().
let browser;

// Navigation options passed to page.goto().
const pageOptions = { waitUntil: 'domcontentloaded', timeout: 90000 };
/**
 * Collects the text of every element matching '#boxes > div.box'.
 * Reads the global `document`, so it only works where one is defined.
 *
 * @returns {string[]} The `innerText` of each matching element, in query order.
 */
function extractItems() {
  const boxes = document.querySelectorAll('#boxes > div.box');
  return Array.from(boxes, (box) => box.innerText);
}
/**
 * Scrolls the page down in 3000px steps, pausing between steps.
 *
 * Scrolling is best-effort: if a step fails, the error is logged and the
 * function returns normally so the caller can still work with whatever the
 * page has loaded so far.
 *
 * @param {object} page - Puppeteer page (only `page.evaluate` is used).
 * @param {number|string} loopCount - Number of scroll steps to perform.
 * @param {number} [scrollDelay=500] - Pause after each step, in milliseconds.
 * @returns {Promise<void>}
 */
async function scrapeInfiniteScrollItems(page, loopCount, scrollDelay = 500) {
  // The value may arrive as a raw command-line string; compare as a number.
  const totalScrolls = Number(loopCount);
  try {
    // Inclusive upper bound so that exactly `loopCount` scroll steps are
    // performed (the previous `i < loopCount` did one fewer than requested).
    for (let i = 1; i <= totalScrolls; i++) {
      console.log('scroll:', i);
      await page.evaluate((step) => {
        window.scrollTo(0, step * 3000);
      }, i);
      // Plain timer instead of page.waitFor(), which is deprecated and no
      // longer present in current Puppeteer releases.
      await new Promise((resolve) => setTimeout(resolve, scrollDelay));
    }
  } catch (e) {
    // Previously swallowed silently, which hid why scrolling stopped.
    console.error('Scrolling stopped early:', e);
  }
}
// Resource types whose requests are aborted to speed up page loads.
const BLOCKED_RESOURCE_TYPES = ['image', 'stylesheet', 'font', 'media', 'object'];

/** Resolves after `ms` milliseconds. */
function pause(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Opens a new 1024x900 tab that aborts requests for BLOCKED_RESOURCE_TYPES.
 *
 * @param {object} browserInstance - Puppeteer browser to open the tab in.
 * @returns {Promise<object>} The configured Puppeteer page.
 */
async function openFilteredPage(browserInstance) {
  const tab = await browserInstance.newPage();
  await tab.setViewport({ width: 1024, height: 900 });
  await tab.setRequestInterception(true);
  tab.on('request', (request) => {
    // resourceType is a method in current Puppeteer; comparing the function
    // itself to a string never matches, so nothing was being blocked. The
    // property form is still accepted in case a legacy version exposes one.
    const type = typeof request.resourceType === 'function'
      ? request.resourceType()
      : request.resourceType;
    if (BLOCKED_RESOURCE_TYPES.includes(type)) {
      request.abort();
    } else {
      request.continue();
    }
  });
  return tab;
}

/**
 * Inserts one row into the `data` table.
 *
 * @param {object} row - Column/value pairs for `INSERT INTO data SET ?`.
 * @returns {Promise<object>} Resolves with the query results, rejects on a query error.
 */
function insertRow(row) {
  return new Promise((resolve, reject) => {
    connection.query('INSERT INTO data SET ?', row, (error, results) => {
      if (error) {
        reject(error);
      } else {
        resolve(results);
      }
    });
  });
}

/** Closes the MySQL connection and the browser; failures are logged, never thrown. */
async function releaseResources() {
  await new Promise((resolve) => {
    connection.end((error) => {
      if (error) {
        console.log('Error', error);
      }
      resolve();
    });
  });
  if (typeof browser !== 'undefined') {
    try {
      await browser.close();
    } catch (e) {
      console.log('Error', e);
    }
  }
}

/**
 * Runs the whole scrape: searches `config.beginUrl` for `keyword`, scrolls the
 * result page `limit` times, visits every collected video URL, extracts the
 * first email-like string from each page and stores one row per video.
 *
 * Always closes the database connection and the browser, then exits the
 * process (exit code 1 if the run failed).
 *
 * @returns {Promise<void>}
 */
async function run() {
  try {
    if (typeof browser !== 'undefined') {
      await browser.close();
    }
    // To route traffic through a local SOCKS5 proxy, add
    // '--proxy-server=socks5://127.0.0.1:9050' to `args`.
    browser = await puppeteer.launch({
      timeout: 90000,
      ignoreHTTPSErrors: true,
      headless: config.headless,
      args: ['--no-sandbox'],
    });
    console.log('START RUN');

    // page2 visits the individual videos; page holds the search results.
    const page2 = await openFilteredPage(browser);
    const page = await openFilteredPage(browser);

    await page.goto(config.beginUrl, pageOptions);
    await page.evaluate((searchTerm) => {
      document.querySelector('input#search').value = searchTerm;
      document.querySelector('#search-icon-legacy').click();
    }, keyword);
    await pause(4000);

    // jQuery is injected because the extraction below is written against it.
    await page.addScriptTag({
      url: 'https://code.jquery.com/jquery-3.2.1.min.js',
    });
    await scrapeInfiniteScrollItems(page, limit, 500);

    const resultats = await page.evaluate(() => {
      const blocs = Array.from(jQuery('.text-wrapper.style-scope.ytd-video-renderer'));
      return blocs.map((bloc) => {
        const node = jQuery(bloc);
        return {
          url: node.find('a').eq(0).prop('href'),
          channel: node.find('#byline').text(),
          urlchannel: node.find('a.yt-formatted-string').prop('href'),
        };
      });
    });
    console.log('resultats len:', resultats.length);
    await pause(5000);

    for (let j = 0; j < resultats.length; j++) {
      const res = resultats[j];
      if (!res.url) {
        // No link was found in this result block, so there is nothing to visit.
        continue;
      }
      await page2.goto(res.url, pageOptions);
      // Takes the first email-like token found anywhere in the page markup
      // (or null if there is none); it is not validated any further.
      const email = await page2.evaluate(() => {
        const html = document.documentElement.outerHTML;
        const matches = html.match(/([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+)/gi);
        return matches ? matches[0] : null;
      });
      await pause(500);

      const row = Object.assign({}, res, { email: email, keyword: keyword, liste: liste });
      try {
        // Awaited so that the row is stored before moving on and so that a
        // failed insert is reported; one failed insert does not stop the run.
        await insertRow(row);
        console.log('inserted', j + 1, '/', resultats.length, row);
      } catch (e) {
        console.log('Error', e);
      }
      await pause(10);
    }
  } catch (err) {
    console.log(err);
    console.log('ERROR');
    process.exitCode = 1;
  } finally {
    // connection.end() lets queued queries finish before the process exits.
    await releaseResources();
    process.exit();
  }
}
// Script entry point: start the scrape. run() catches and logs its own errors.
run();