import { spawn } from 'child_process';
import WebSocket from 'ws';
import fetch from 'node-fetch';

const CHROME_PATH = 'chromium';
const DEBUG_PORT = 9222;

// Monotonic id attached to every debugger command so replies can be matched.
let msgId = 1;
// Command id -> {resolve, reject, timer} for commands still awaiting a reply.
const pending = new Map();
// Frame id -> request id of the last "Document" request seen for that frame.
const rawhtml = new Map();
// Debugger websocket; assigned by doshit() once a page target is known.
let ws = null;

//
// Close chrome on script exit
//
process.on("exit", function(){
	kill_chrome();
});

//
// FUNctions
//

/**
 * Resolve after the given number of milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms){
	return new Promise(function(resolve){
		setTimeout(resolve, ms);
	});
}

/**
 * Signal the process group of the spawned chrome instance.
 *
 * Never throws. A missing process group (ESRCH) is treated as success because
 * it means chrome has already exited; any other failure is reported.
 */
function kill_chrome(){
	try{
		if(!chrome.pid){
			// spawn failed, there is nothing to kill
			return;
		}

		// negative pid: signal the whole group (chrome is spawned detached)
		process.kill(-chrome.pid);
	}catch(e){
		if(e.code !== "ESRCH"){
			console.log("Chrome is still running somehow, kill it");
		}
	}
}

/**
 * Report a fatal error, stop chrome and mark the run as failed.
 *
 * @param {*} err - Error (or any thrown value) describing the failure.
 */
function fail(err){
	const reason = err instanceof Error ? err.message : String(err);

	console.log(` FAIL: ${reason}`);
	kill_chrome();
	process.exitCode = 1;
}

/**
 * Poll the debugger's /json/version endpoint until it answers with a 2xx.
 *
 * @param {number} port - Remote debugging port chrome was started with.
 * @param {number} [timeout=20000] - Milliseconds to keep polling.
 * @returns {Promise<void>} Resolves once the debugger responds.
 * @throws {Error} If no successful response arrives within the timeout.
 */
async function poll_debugger(port, timeout = 20000){
	const start = Date.now();

	for(;;){
		try{
			const res = await fetch(`http://localhost:${port}/json/version`);

			if(res.ok){
				return;
			}
		}catch(e){
			// not ready yet (request failed), retry below
		}

		if(Date.now() - start > timeout){
			throw new Error("Chrome debugger did not respond");
		}

		await sleep(150);
	}
}

/**
 * Send a debugger command without waiting for its reply.
 *
 * @param {string} method - Protocol method name, e.g. "Page.enable".
 * @param {object} [params={}] - Method parameters.
 * @returns {number} The id assigned to the command.
 */
function send_command(method, params = {}){
	const id = msgId++;

	ws.send(JSON.stringify({id, method, params}));
	return id;
}

/**
 * Send a debugger command and wait for the matching reply.
 *
 * The reply is delivered by the websocket "message" handler, which looks the
 * id up in `pending`.
 *
 * @param {string} method - Protocol method name.
 * @param {object} [params={}] - Method parameters.
 * @param {number} [timeout=5000] - Milliseconds to wait for the reply.
 * @returns {Promise<object>} The `result` member of the reply.
 * @throws {Error} On timeout, or when the reply carries an `error` member.
 */
async function send_command_wait(method, params = {}, timeout = 5000){
	const id = send_command(method, params);

	return new Promise((resolve, reject) => {
		const timer = setTimeout(
			function(){
				pending.delete(id);
				reject(new Error(`Timeout waiting for ${method}`));
			},
			timeout
		);

		pending.set(id, {resolve, reject, timer});
	});
}

//
// Spawn chrome
//
console.log(` Creating chrome instance @ ${CHROME_PATH} with debugger port ${DEBUG_PORT}`);

const chrome = spawn(
	CHROME_PATH,
	[
		`--remote-debugging-port=${DEBUG_PORT}`,
		'--incognito',
		'--no-first-run',
		'--no-default-browser-check',
		'--password-store=basic',
		//'--proxy-server=https://blade1.frankfurt-rack444.nodes.gen4.ninja:9002',
		//'--user-data-dir=/tmp/chrome-temp-profile',
		'--window-size=1200,800'
	],
	{
		// detached puts chrome in its own process group so kill_chrome() can
		// signal the whole group through the negative pid
		detached: true,
		stdio: "ignore"
	}
);

// without a listener a failed spawn (e.g. binary not found) is an uncaught exception
chrome.on("error", fail);
chrome.unref();

/**
 * Dump the cookies and the raw HTML of the search results page, then stop
 * chrome and print the result object.
 *
 * @param {string} frame_id - Id of the top-level frame that navigated.
 * @returns {Promise<void>}
 * @throws {Error} If a debugger command fails or no document request was
 *   recorded for the frame.
 */
async function scrape_search_page(frame_id){
	// NOTE(review): Math.random() adds less than 1ms here; if real jitter was
	// intended it needs a multiplier. Timing is left exactly as it was.
	await sleep(1000 + Math.random());

	// dump cookies
	const cookie_raw = await send_command_wait(
		"Network.getCookies",
		{urls: ["https://google.com"]},
		3000
	);
	const cookie =
		cookie_raw.cookies
		.map(c => `${c.name}=${c.value}`)
		.join("; ");

	console.log(" Scraped cookies");

	// dump raw html
	const request_id = rawhtml.get(frame_id);

	if(request_id === undefined){
		throw new Error("No document request recorded for the search frame");
	}

	const body = await send_command_wait(
		"Network.getResponseBody",
		{requestId: request_id}
	);

	// decode as UTF-8; atob() would hand back a Latin-1 binary string and
	// mangle every non-ASCII character of the page
	const html =
		body.base64Encoded ?
		Buffer.from(body.body, "base64").toString("utf8") :
		body.body;

	console.log(" Scraped HTML");

	kill_chrome();

	console.log({
		status: "ok",
		cookie: cookie,
		raw_html: html
	});
}

/**
 * Dismiss the consent prompt on the homepage (if present), wait, then type
 * the query and click the search button.
 *
 * @returns {Promise<void>}
 * @throws {Error} If a debugger command fails or times out.
 */
async function run_search(){
	// remove GDPR prompt
	console.log(" Clearing GPDR prompt");

	await send_command_wait(
		"Runtime.evaluate",
		{
			expression:
				'window.addEventListener("load", function(){' +
					// click out of the consent prompt, if it is there
					'var btns = Array.from(document.getElementsByTagName("button"));' +
					'if(btns.length >= 7){' +
						'btns[5].click();' +
					'}' +
				'});'
		}
	);

	console.log(" Sleeping for ~3 seconds");
	// NOTE(review): same sub-millisecond "jitter" as in scrape_search_page()
	await sleep(3100 + Math.random());

	// do actual search
	console.log(" Sending search command");

	await send_command_wait(
		"Runtime.evaluate",
		{
			expression:
				'document.getElementsByTagName("textarea")[0].value = "asmr";' +
				'document.getElementsByName("btnK")[0].click();'
		}
	);
}

/**
 * React to a top-level frame navigation according to the URL reached.
 *
 * @param {object} frame - `frame` member of a Page.frameNavigated event.
 * @returns {Promise<void>}
 */
async function handle_navigation(frame){
	const url = frame.url;

	if(/^https?:\/\/(www\.)?google\.[a-z]{1,10}\/search\?/.test(url)){
		if(/sei=/.test(url)){
			console.log(" Google returned a search page!");
			await scrape_search_page(frame.id);
		}else{
			console.log(" Obtaining search token...");
		}
	}else if(/\/sorry\//.test(url)){
		console.log(" FAIL: Got captcha page");
		kill_chrome();
	}else if(url === "https://www.google.com/"){
		await run_search();
	}else{
		// this shouldn't really trigger
		console.log(` Visited ${url}`);
	}
}

//
// Connect to debugger
//

/**
 * Wait for chrome's debugger, attach to the first page target and drive the
 * search/scrape flow from debugger events.
 *
 * @returns {Promise<void>} Resolves once the websocket handlers are
 *   registered; the flow itself continues inside those handlers.
 * @throws {Error} If the debugger never comes up or the target list cannot
 *   be fetched.
 */
async function doshit(){
	console.log(" Waiting for chrome bloatware...");
	await poll_debugger(DEBUG_PORT);
	console.log(" Debugger alive");

	const res = await fetch(`http://localhost:${DEBUG_PORT}/json`);
	const targets = await res.json();
	const page = targets.find(t => t.type === "page");

	if(!page){
		console.log(" Failed to find a page object");
		kill_chrome();
		process.exitCode = 1;
		// nothing to attach to; going on would dereference undefined
		return;
	}

	ws = new WebSocket(page.webSocketDebuggerUrl);

	// an unhandled "error" event would otherwise be thrown as an exception
	ws.on("error", fail);

	//
	// Navigate to the page on open
	//
	ws.on("open", async function(){
		try{
			console.log(` Connected to debugger @ "${page.title}" (${page.url})`);

			await send_command_wait("Network.enable");
			await send_command_wait("Page.enable");
			console.log(" Registered to debugger events");

			await send_command_wait(
				"Page.navigate",
				{
					url: "https://www.google.com"
				}
			);
		}catch(e){
			fail(e);
		}
	});

	//
	// Handle incoming websocket messages
	//
	ws.on("message", async function(msg){
		try{
			const data = JSON.parse(msg.toString());

			// resolve promises if need be
			if(data.id && pending.has(data.id)){
				const {resolve, reject, timer} = pending.get(data.id);

				clearTimeout(timer);
				pending.delete(data.id);

				if(data.error){
					// reject with a real Error, keeping the protocol fields
					const err = new Error(data.error.message || "Debugger command failed");

					err.code = data.error.code;
					err.data = data.error.data;
					reject(err);
				}else{
					resolve(data.result);
				}
			}

			// create map of all raw html payloads
			if(
				data.method === "Network.requestWillBeSent" &&
				data.params.type === "Document"
			){
				rawhtml.set(data.params.frameId, data.params.requestId);
			}

			// log pages visited (top-level frames only)
			if(
				data.method === "Page.frameNavigated" &&
				!data.params.frame.parentId
			){
				await handle_navigation(data.params.frame);
			}
		}catch(e){
			fail(e);
		}
	});
}

doshit().catch(fail);