the excuse of a search engine broke my scraper again. go back to generating slop you waste of oxygen

This commit is contained in:
2026-03-04 02:07:43 -05:00
parent cdf9164113
commit f324ddd3a7

View File

@@ -520,7 +520,7 @@ class google{
if($alt_ua === true){ if($alt_ua === true){
curl_setopt($curlproc, CURLOPT_HTTPHEADER, [ curl_setopt($curlproc, CURLOPT_HTTPHEADER, [
"User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows CE; PPC; 240x320) Opera 8.65 [nl]", "User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 26_0_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
"Accept: text/html, application/xml;q=0.9, */*;q=0.8", "Accept: text/html, application/xml;q=0.9, */*;q=0.8",
"Accept-Language: nl,en;q=0.8", "Accept-Language: nl,en;q=0.8",
"Accept-Encoding: gzip, deflate", "Accept-Encoding: gzip, deflate",
@@ -573,10 +573,8 @@ class google{
public function web($get){ public function web($get){
// this is going to break soon. I wont scrape the answers simply cause its not worth my time. // it broke again. lasted 3 months
// If only their API wasn't such dogshit maybe I wouldnt need to fuck with this. this isn't // lets hope for another solid 3 month
// just a rant. I know a Google engineer is reading this, give me real fucking results
// you worthless sacks of shit
$out = [ $out = [
"status" => "ok", "status" => "ok",
@@ -690,6 +688,7 @@ class google{
throw new Exception("Failed to get HTML"); throw new Exception("Failed to get HTML");
} }
//$html = file_get_contents("scraper/google.html"); //$html = file_get_contents("scraper/google.html");
} }
@@ -698,25 +697,22 @@ class google{
$this->detect_sorry(); $this->detect_sorry();
$this->parsestyles(); $this->parsestyles();
// iterate over results // get javascript images
$containers = $this->scrape_dimg($html);
$this->scrape_imagearr($html);
// get next page
$npt =
$this->fuckhtml $this->fuckhtml
->getElementsByClassName( ->getElementsByAttributeValue(
$this->getstyle([ "aria-label",
"background-color" => "#fff", "More search results",
"margin-bottom" => "10px", "a"
"margin" => "0px 0px 8px",
"box-shadow" => "0 0 0 1px #ebedef"
])
); );
foreach($containers as $container){ if(count($npt) === 0){
$this->fuckhtml->load($container); // maybe we have the npt object from 2nd page, probe for that
//
// Probe for next page container
//
$npt = $npt =
$this->fuckhtml $this->fuckhtml
->getElementsByAttributeValue( ->getElementsByAttributeValue(
@@ -724,283 +720,317 @@ class google{
"Next page", "Next page",
"a" "a"
); );
}
if(count($npt) !== 0){ if(count($npt) !== 0){
// found next page object $out["npt"] =
$out["npt"] = $this->backend->store(
$this->backend->store(
$this->fuckhtml
->getTextContent(
$npt[0]
["attributes"]
["href"]
),
"web",
$proxy
);
continue;
}
//
// Probe for "did you mean" bullshit
//
$ddm =
$this->fuckhtml
->getElementsByClassName(
$this->getstyle([
"font-size" => "20px",
"font-weight" => "bold",
"line-height" => "26px",
"color" => "#1f1f1f",
"height" => "14px",
"padding" => "16px 14px 0px 14px",
"margin" => "0"
])
);
if(
count($ddm) !== 0 &&
strtolower(
$this->fuckhtml $this->fuckhtml
->getTextContent( ->getTextContent(
$ddm[0] $npt[0]["attributes"]["href"]
) ),
) == "people also search for" "web",
){ $proxy
$as =
$this->fuckhtml
->getElementsByTagName("a");
foreach($as as $a){
$out["related"][] =
$this->fuckhtml
->getTextContent(
$a
);
}
continue;
}
//
// Parse normal web results
//
// probe for website ellipsis shit
$ellipsis =
$this->fuckhtml
->getElementsByClassName(
$this->getstyle([
"text-overflow" => "ellipsis",
"white-space" => "nowrap",
"overflow" => "hidden"
])
); );
}
if(count($ellipsis) < 1){ // outer div is .MjjYud
// inner div always contain role="presentation"
// should not happen $outer =
continue; $this->fuckhtml
} ->getElementsByClassName(
"MjjYud",
"div"
);
$title = // used later
$fancycontainer_class =
explode(
" ",
$this->getstyle([
"padding-top" => "4px",
"padding-bottom" => "calc(12px*1)"
]),
2
);
if(count($fancycontainer_class) === 2){
$fancycontainer_class = $fancycontainer_class[1];
}else{
$fancycontainer_class = false;
}
foreach($outer as $container){
$this->fuckhtml->load($container);
// probe for search result
$sprobe =
$this->fuckhtml $this->fuckhtml
->getElementsByTagName( ->getElementsByAttributeValue(
"h3" "role",
); "presentation",
if(count($title) === 0){
// should not happen
continue;
}
$title =
$this->fuckhtml
->getTextContent(
$title[0]
);
// get URL
$as =
$this->fuckhtml
->getElementsByTagName(
"a" "a"
); );
if(count($as) === 0){ if(count($sprobe) !== 0){
// should not happen // we found a search result
continue;
}
$link = $title =
$this->unshiturl( $this->fuckhtml
$as[0] ->getElementsByAttributeValue(
["attributes"] "role",
["href"] "link",
); "div"
);
// grep container separators if(count($title) === 0){
$separator =
$this->fuckhtml
->getElementsByClassName(
$this->getstyle([
"padding" => "16px 14px 12px"
])
);
if(count($separator) < 2){ // should not happen
continue;
// should not happen
continue;
}
$this->fuckhtml->load($separator[1]);
$snippets =
$this->fuckhtml
->getElementsByClassName(
$this->getstyle([
"white-space" => "pre-line",
"word-wrap" => "break-word"
])
);
if(count($snippets) < 2){
// should not happen
continue;
}
// get description
$description =
$this->fuckhtml
->getTextContent(
$snippets[1]
);
// get date from description
$exp_description = explode(" · ", $description, 2);
$date = null;
if(count($exp_description) === 1){
$description = $exp_description[0];
}else{
$date_probe = strtotime($exp_description[0]);
if(
strlen($exp_description[0]) <= 17 &&
$date_probe !== false
){
$date = $date_probe;
$description = $exp_description[1];
} }
}
// get thumb $title =
$thumb_probe = $this->titledots(
$this->fuckhtml $this->fuckhtml
->getElementsByTagName( ->getTextContent(
"img" $title[0]
);
// too lazy to fix this piece of shit
// will probably break soon anyways idgaf
/*
if(count($thumb_probe) === 0){
$thumb = [
"ratio" => null,
"url" => null
];
}else{
$thumb = [
"ratio" => "1:1",
"url" =>
$this->getdimg(
$thumb_probe[0]
["attributes"]
["id"]
) )
]; );
}*/
$thumb = [ // get description
"ratio" => null, // as usual, theres a thousand fucking possible divs for this one
"url" => null
];
// get sublinks // probe for youtube-like description
$sublinks = []; $description =
foreach($as as $a){
$this->fuckhtml->load($a);
$probe =
$this->fuckhtml $this->fuckhtml
->getElementsByClassName( ->getElementsByClassName(
$this->getstyle([ $this->getstyle([
"color" => "#1558d6", "align-items" => "flex-start",
"font-size" => "14px", "display" => "flex",
"line-height" => "20px" "justify-content" => "center",
]) "padding" => "7px 12px",
"padding-right" => "0",
"padding-top" => "0"
]),
"div"
); );
$url = $ratio = "16:9";
$this->unshiturl(
$a["attributes"]["href"] if(count($description) === 0){
// fail. find the one with the image on the right handside
$description =
$this->fuckhtml
->getElementsByAttributeValue(
"style",
"padding-top:2px;padding-right:8px;padding-left:16px;padding-bottom:12px",
"div"
);
$ratio = "1:1";
if(count($description) === 0){
// fail. find the one that is used the most
$description =
$this->fuckhtml
->getElementsByAttributeValue(
"style",
"-webkit-line-clamp:3",
"div"
);
if(count($description) === 0){
// last fail. this one appears with divs that have prices
$description =
$this->fuckhtml
->getElementsByAttributeValue(
"style",
"max-width:100vw;grid-area:nke7rc;padding-top:2px;padding-right:8px;padding-left:16px;padding-bottom:6px",
"div"
);
}
}
}
if(count($description) === 0){
// should not happen but whatever
$description = null;
}else{
$description =
$this->titledots(
$this->fuckhtml
->getTextContent(
$description[0]
)
);
}
// probe for date
$desc2 = explode("", $description, 2);
$time = null;
if(count($desc2) === 2){
$time = strtotime($desc2[0]);
if(
strlen($desc2[0]) < 16 &&
$time !== false
){
$description = ltrim($desc2[1]);
}else{
$time = null;
}
}
$thumb = [
"ratio" => null,
"url" => null,
];
// get thumbnail
$images =
$this->fuckhtml
->getElementsByTagName(
"img"
); );
if( foreach($images as $image){
preg_match(
'/^http/',
$url
)
){
if(count($probe) !== 0){ if(isset($image["attributes"]["id"])){
$sublinks[] = [ $thumb = [
"title" => "ratio" => $ratio,
$this->titledots( "url" => $this->getdimg($image["attributes"]["id"])
$this->fuckhtml
->getTextContent(
$probe[0]
)
),
"description" => null,
"date" => null,
"url" => $url
]; ];
} }
} }
// get sublinks
$sublinks = [];
// probe for the fancy version
if($fancycontainer_class !== false){
$fancycontainer =
$this->fuckhtml
->getElementsByClassName(
$fancycontainer_class,
"div"
);
}
if(
$fancycontainer_class !== false &&
count($fancycontainer) !== 0
){
$this->fuckhtml->load($fancycontainer[0]);
$as =
$this->fuckhtml
->getElementsByTagName(
"a"
);
foreach($as as $a){
$sublinks[] = [
"title" =>
$this->fuckhtml
->getTextContent(
$a
),
"description" => null,
"date" => null,
"url" =>
$this->unshiturl(
$a["attributes"]["href"]
)
];
}
}
$out["web"][] = [
"title" => $title,
"description" => $description,
"url" => $this->unshiturl($sprobe[0]["attributes"]["href"]),
"date" => $time,
"type" => "web",
"thumb" => $thumb,
"sublink" => $sublinks,
"table" => []
];
continue;
} }
$out["web"][] = [ // probe for containers with a title header
"title" => $title_header =
$this->titledots( $this->fuckhtml
$title ->getElementsByClassName(
), $this->getstyle([
"description" => "display" => "flex",
$this->titledots( "flex-wrap" => "wrap",
$description "position" => "relative",
), "padding" => "16px"
"url" => $link, ])
"date" => $date, );
"type" => "web",
"thumb" => $thumb, if(count($title_header) !== 0){
"sublink" => $sublinks,
"table" => [] $title_header =
]; strtolower(
$this->fuckhtml
->getTextContent(
$title_header[0]
)
);
switch($title_header){
case "people also search for":
// get all related searches
$relateds =
$this->fuckhtml
->getElementsByClassName(
$this->getstyle([
"display" => "flex",
"height" => "100%",
"flex-direction" => "column",
"max-width" => "100%"
])
);
foreach($relateds as $r){
$out["related"][] =
$this->fuckhtml
->getTextContent(
$r
);
}
break;
}
continue;
}
} }
$out["related"] = array_values(array_unique($out["related"]));
return $out; return $out;
} }
@@ -1324,6 +1354,8 @@ class google{
public function news($get){ public function news($get){
throw new Exception("Broke for now, fuck off lol");
if($get["npt"]){ if($get["npt"]){
[$req, $proxy] = $this->backend->get($get["npt"], "news"); [$req, $proxy] = $this->backend->get($get["npt"], "news");