fix scrape failures on google

This commit is contained in:
2026-03-05 21:06:19 -05:00
parent 4e247c3ac4
commit 2e5edda85b

View File

@@ -522,7 +522,7 @@ class google{
curl_setopt($curlproc, CURLOPT_HTTPHEADER, [ curl_setopt($curlproc, CURLOPT_HTTPHEADER, [
"User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 26_0_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", "User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 26_0_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
"Accept: text/html, application/xml;q=0.9, */*;q=0.8", "Accept: text/html, application/xml;q=0.9, */*;q=0.8",
"Accept-Language: nl,en;q=0.8", "Accept-Language: en-US,en;q=0.8",
"Accept-Encoding: gzip, deflate", "Accept-Encoding: gzip, deflate",
"Connection: Keep-Alive", "Connection: Keep-Alive",
"Cache-Control: no-cache" "Cache-Control: no-cache"
@@ -770,18 +770,6 @@ class google{
$this->fuckhtml->load($container); $this->fuckhtml->load($container);
// probe for search result // probe for search result
$sprobe =
$this->fuckhtml
->getElementsByAttributeValue(
"role",
"presentation",
"a"
);
if(count($sprobe) !== 0){
// we found a search result
$title = $title =
$this->fuckhtml $this->fuckhtml
->getElementsByAttributeValue( ->getElementsByAttributeValue(
@@ -790,11 +778,9 @@ class google{
"div" "div"
); );
if(count($title) === 0){ if(count($title) !== 0){
// should not happen // we found a search result
continue;
}
$title = $title =
$this->titledots( $this->titledots(
@@ -804,6 +790,41 @@ class google{
) )
); );
// get url
$sprobe =
$this->fuckhtml
->getElementsByTagName(
"a"
);
$link = null;
foreach($sprobe as $possible_link){
if(
isset($possible_link["attributes"]["href"]) &&
preg_match(
'/^\/url\?q=/',
$possible_link["attributes"]["href"]
)
){
$link =
$this->fuckhtml
->getTextContent(
$possible_link["attributes"]["href"]
);
break;
}
}
if($link === null){
// should not happen
continue;
}
// get description // get description
// as usual, theres a thousand fucking possible divs for this one // as usual, theres a thousand fucking possible divs for this one
@@ -968,7 +989,7 @@ class google{
$out["web"][] = [ $out["web"][] = [
"title" => $title, "title" => $title,
"description" => $description, "description" => $description,
"url" => $this->unshiturl($sprobe[0]["attributes"]["href"]), "url" => $this->unshiturl($link),
"date" => $time, "date" => $time,
"type" => "web", "type" => "web",
"thumb" => $thumb, "thumb" => $thumb,