How to capture emails on YouTube

Here is a Puppeteer script that captures the email addresses exposed on YouTube. It runs a YouTube search for a keyword, scrolls through the result pages, then visits each video one by one and looks for an email address in the watch page.
You will of course need to create a database to store your list.
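
The script writes into a table named data, one row per video and one column per captured field. As a sketch, here is a one-off helper that creates it; the column names are the ones the script actually inserts, while the types and lengths, and the file name create_table.js, are assumptions to adapt:

// create_table.js - creates the table that youtube.js writes to.
// Column names match the fields inserted by the script; the types are an assumption.
const mysql = require('mysql');
const config = require('./config.json');

const connection = mysql.createConnection({
    host: config.host,
    user: config.user,
    password: config.password,
    database: config.DB
});

connection.query(
    'CREATE TABLE IF NOT EXISTS data (' +
    '  id INT AUTO_INCREMENT PRIMARY KEY,' +
    '  url VARCHAR(255),' +
    '  channel VARCHAR(255),' +
    '  urlchannel VARCHAR(255),' +
    '  email VARCHAR(255),' +
    '  keyword VARCHAR(255),' +
    '  liste VARCHAR(255)' +
    ')',
    function (error) {
        if (error) throw error;
        console.log('table data ready');
        connection.end();
    }
);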

The syntax is as follows:

node youtube.js "jeu" 1000 "liste1"

"jeu" is the search keyword
1000 is the number of pages to scroll through
"liste1" is the name of the list
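
The script also expects a config.json file next to it, holding the MySQL credentials and a couple of settings. A minimal sketch; the field names are the ones the script reads, while the values, and beginUrl pointing at the YouTube home page, are assumptions to adapt:

{
    "host": "localhost",
    "user": "root",
    "password": "secret",
    "DB": "scraping",
    "headless": true,
    "beginUrl": "https://www.youtube.com"
}

Here is the full youtube.js script:
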
const puppeteer = require('puppeteer');
const mysql = require('mysql');
const config = require('./config.json');

var connection = mysql.createConnection({
    host     : config.host,
    user     : config.user,
    password : config.password,
    database : config.DB,
    debug: false
});
connection.connect();

var browser;
var pageOptions = {waitUntil: 'domcontentloaded', timeout: 90000};

// Command-line arguments: keyword, number of scroll steps, list name.
var keyword = process.argv[2];
var limit = parseInt(process.argv[3], 10);
var liste = process.argv[4];
console.log(keyword, limit, liste);


// Note: leftover helper from another scraper; it is never called below.
function extractItems() {
    const extractedElements = document.querySelectorAll('#boxes > div.box');
    const items = [];
    for (let element of extractedElements) {
        items.push(element.innerText);
    }
    return items;
}

async function scrapeInfiniteScrollItems(page, loopCount, scrollDelay = 500) {
    try {
        for (var i = 1; i < loopCount; i++) {
            console.log('scroll:', i);
            // Scroll further down on each iteration so new results keep loading.
            await page.evaluate((i) => {
                window.scrollTo(0, i * 3000);
            }, i);
            await page.waitFor(scrollDelay);
        }
    } catch (e) {
        // Ignore scroll errors and keep whatever has been loaded so far.
    }
    return;
}

async function run() {
    try {
        // Close any previous browser instance before launching a new one.
        if (browser) {
            await browser.close();
        }
        //browser = await puppeteer.launch({timeout: 90000, ignoreHTTPSErrors: true, headless: config.headless, args: ['--no-sandbox', '--proxy-server=socks5://127.0.0.1:9050']});
        browser = await puppeteer.launch({timeout: 90000, ignoreHTTPSErrors: true, headless: config.headless, args: ['--no-sandbox']});
        console.log('START RUN');

        // page2 will be used to visit each video; page is used for the search results.
        var page2 = await browser.newPage();
        await page2.setViewport({width: 1024, height: 900});
        await page2.setRequestInterception(true);
        page2.on('request', request => {
            // resourceType() is a method in Puppeteer >= 1.0; block heavy resources to speed things up.
            if (['image', 'stylesheet', 'font', 'media', 'object'].includes(request.resourceType()))
                request.abort();
            else
                request.continue();
        });

        var page = await browser.newPage();
        await page.setViewport({width: 1024, height: 900});
        await page.setRequestInterception(true);
        page.on('request', request => {
            if (['image', 'stylesheet', 'font', 'media', 'object'].includes(request.resourceType()))
                request.abort();
            else
                request.continue();
        });


        await page.goto(config.beginUrl, pageOptions);
        // Type the keyword into the search box and trigger the search.
        await page.evaluate((keyword) => {
            document.querySelector("input#search").value = keyword;
            document.querySelector("#search-icon-legacy").click();
        }, keyword);

        await page.waitFor(4000);
        // Inject jQuery; it is used below to extract the result blocks.
        await page.addScriptTag({
            url: "https://code.jquery.com/jquery-3.2.1.min.js"
        });
        await scrapeInfiniteScrollItems(page, limit, 500);

        var resultats = await page.evaluate(() => {
            var res = [];
            // Each search result is wrapped in a .text-wrapper ytd-video-renderer block.
            var blocs = Array.from(jQuery('.text-wrapper.style-scope.ytd-video-renderer'));
            console.log('blocs captured', blocs.length);
            for (var i = 0; i < blocs.length; i++) {
                var bloc = blocs[i];
                var channel = jQuery(bloc).find("#byline").text();
                var urlchannel = jQuery(bloc).find("a.yt-formatted-string").prop("href");
                var url = jQuery(bloc).find('a').eq(0).prop("href");
                res.push({url: url, channel: channel, urlchannel: urlchannel});
            }
            console.log('res captured', res.length);
            return res;
        });

        console.log('resultats len:', resultats.length);
        await page.waitFor(5000);
        for (let j = 0; j < resultats.length; j++) {

            let res = resultats[j];
            await page2.goto(res.url, pageOptions);
            // Grab the first email-looking string found in the page HTML.
            var email = await page2.evaluate(() => {
                var html = document.documentElement.outerHTML;
                var email = html.match(/([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+)/gi);
                if (email) {
                    email = email[0];
                }
                return email;
            });
            await page2.waitFor(500);
            res.email = email;
            res.keyword = keyword;
            res.liste = liste;
            try {
                connection.query('INSERT INTO data SET ?', res, function (error, results, fields) {
                    if (error) console.log('Insert error', error);
                    else console.log('inserted', j, '/', resultats.length, res);
                });
            } catch (e) {
                console.log('Error', e);
            }
            await page.waitFor(10);
        }
        // Let the queued inserts finish, then close the connection and exit.
        connection.end(function () {
            process.exit();
        });

    } catch (err) {
        console.log(err);
        console.log('ERROR');
        //run();
    }
}
run();
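
Once a run is done, the captured addresses can be pulled back out of the data table. As a sketch, a small helper that reuses the same config.json; the file name export.js and the query are suggestions:

// export.js - prints the distinct emails captured for a given list.
// Usage: node export.js "liste1"
const mysql = require('mysql');
const config = require('./config.json');

const connection = mysql.createConnection({
    host: config.host,
    user: config.user,
    password: config.password,
    database: config.DB
});

connection.query(
    'SELECT DISTINCT email FROM data WHERE liste = ? AND email IS NOT NULL',
    [process.argv[2]],
    function (error, rows) {
        if (error) throw error;
        rows.forEach(function (row) { console.log(row.email); });
        connection.end();
    }
);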