From 2e5edda85b341074afd48a6a3c37ad9e2b249679 Mon Sep 17 00:00:00 2001 From: lolcat Date: Thu, 5 Mar 2026 21:06:19 -0500 Subject: [PATCH] fix scrape failures on google --- scraper/google.php | 61 +++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/scraper/google.php b/scraper/google.php index 81a5e2d..2f71e0e 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -522,7 +522,7 @@ class google{ curl_setopt($curlproc, CURLOPT_HTTPHEADER, [ "User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 26_0_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", "Accept: text/html, application/xml;q=0.9, */*;q=0.8", - "Accept-Language: nl,en;q=0.8", + "Accept-Language: en-US,en;q=0.8", "Accept-Encoding: gzip, deflate", "Connection: Keep-Alive", "Cache-Control: no-cache" @@ -770,32 +770,18 @@ class google{ $this->fuckhtml->load($container); // probe for search result - $sprobe = + $title = $this->fuckhtml ->getElementsByAttributeValue( "role", - "presentation", - "a" + "link", + "div" ); - if(count($sprobe) !== 0){ + if(count($title) !== 0){ // we found a search result - $title = - $this->fuckhtml - ->getElementsByAttributeValue( - "role", - "link", - "div" - ); - - if(count($title) === 0){ - - // should not happen - continue; - } - $title = $this->titledots( $this->fuckhtml @@ -804,6 +790,41 @@ class google{ ) ); + // get url + $sprobe = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + $link = null; + + foreach($sprobe as $possible_link){ + + if( + isset($possible_link["attributes"]["href"]) && + preg_match( + '/^\/url\?q=/', + $possible_link["attributes"]["href"] + ) + ){ + + $link = + $this->fuckhtml + ->getTextContent( + $possible_link["attributes"]["href"] + ); + + break; + } + } + + if($link === null){ + + // should not happen + continue; + } + // get description // as usual, theres a thousand fucking possible divs for this one @@ -968,7 +989,7 @@ class google{ $out["web"][] = [ "title" => $title, "description" => $description, - "url" => $this->unshiturl($sprobe[0]["attributes"]["href"]), + "url" => $this->unshiturl($link), "date" => $time, "type" => "web", "thumb" => $thumb,